diff --git a/LlamaFactory/.github/ISSUE_TEMPLATE/1-bug-report.yml b/LlamaFactory/.github/ISSUE_TEMPLATE/1-bug-report.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a08596faa5b3be2545412d372f7bdeadca95afb4
--- /dev/null
+++ b/LlamaFactory/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -0,0 +1,61 @@
+name: "\U0001F41B Bug / help"
+description: Create a report to help us improve the LLaMA Factory
+labels: ["bug", "pending"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Issues included in **[FAQs](https://github.com/hiyouga/LLaMA-Factory/issues/4614)** or those with **insufficient** information may be closed without a response.
+        已经包含在 **[常见问题](https://github.com/hiyouga/LLaMA-Factory/issues/4614)** 内或提供信息**不完整**的 issues 可能不会被回复。
+
+  - type: markdown
+    attributes:
+      value: |
+        Please do not create issues that are not related to framework bugs under this category, use **[Discussions](https://github.com/hiyouga/LLaMA-Factory/discussions/categories/q-a)** instead.
+        请勿在此分类下创建和框架 bug 无关的 issues，训练问题求助请使用 **[讨论区](https://github.com/hiyouga/LLaMA-Factory/discussions/categories/q-a)**。
+
+  - type: checkboxes
+    id: reminder
+    attributes:
+      label: Reminder
+      description: |
+        Please ensure you have read the above rules carefully and searched the existing issues (including FAQs).
+        请确保您已经认真阅读了上述规则并且搜索过现有的 issues（包括常见问题）。
+
+      options:
+        - label: I have read the above rules and searched the existing issues.
+          required: true
+
+  - type: textarea
+    id: system-info
+    validations:
+      required: true
+    attributes:
+      label: System Info
+      description: |
+        Please share your system info with us. You can run the command **llamafactory-cli env** and copy-paste its output below.
+        请提供您的系统信息。您可以在命令行运行 **llamafactory-cli env** 并将其输出复制到该文本框中。
+
+      placeholder: llamafactory version, platform, python version, ...
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Please provide entry arguments, error messages and stack traces that reproduce the problem.
+        请提供入口参数，错误日志以及异常堆栈以便于我们复现问题。
+
+      value: |
+        ```text
+        Put your message here.
+        ```
+
+  - type: textarea
+    id: others
+    validations:
+      required: false
+    attributes:
+      label: Others
diff --git a/LlamaFactory/.github/ISSUE_TEMPLATE/2-feature-request.yml b/LlamaFactory/.github/ISSUE_TEMPLATE/2-feature-request.yml
new file mode 100644
index 0000000000000000000000000000000000000000..5d72271ebc8db3d10bf7e9c6af209e857566bde6
--- /dev/null
+++ b/LlamaFactory/.github/ISSUE_TEMPLATE/2-feature-request.yml
@@ -0,0 +1,41 @@
+name: "\U0001F680 Feature request"
+description: Submit a request for a new feature
+labels: ["enhancement", "pending"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Please do not create issues that are not related to new features under this category.
+        请勿在此分类下创建和新特性无关的 issues。
+
+  - type: checkboxes
+    id: reminder
+    attributes:
+      label: Reminder
+      description: |
+        Please ensure you have read the above rules carefully and searched the existing issues.
+        请确保您已经认真阅读了上述规则并且搜索过现有的 issues。
+
+      options:
+        - label: I have read the above rules and searched the existing issues.
+          required: true
+
+  - type: textarea
+    id: description
+    validations:
+      required: true
+    attributes:
+      label: Description
+      description: |
+        A clear and concise description of the feature proposal.
+        请详细描述您希望加入的新功能特性。
+
+  - type: textarea
+    id: contribution
+    validations:
+      required: false
+    attributes:
+      label: Pull Request
+      description: |
+        Have you already created the relevant PR and submitted the code?
+        您是否已经创建了相关 PR 并提交了代码？
diff --git a/LlamaFactory/.github/ISSUE_TEMPLATE/config.yml b/LlamaFactory/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1a7719634963d9d78bfa5155b51c5a82311084e4
--- /dev/null
+++ b/LlamaFactory/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,8 @@
+blank_issues_enabled: false
+contact_links:
+  - name: 📚 FAQs | 常见问题
+    url: https://github.com/hiyouga/LLaMA-Factory/issues/4614
+    about: Reading in advance is recommended | 建议提前阅读
+  - name: Discussions | 讨论区
+    url: https://github.com/hiyouga/LLaMA-Factory/discussions
+    about: Please ask fine-tuning questions here | 请在这里讨论训练问题
diff --git a/LlamaFactory/.github/workflows/docker.yml b/LlamaFactory/.github/workflows/docker.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fea0a92776530571c7733e70c76216a09aeb4d12
--- /dev/null
+++ b/LlamaFactory/.github/workflows/docker.yml
@@ -0,0 +1,116 @@
+name: docker
+
+on:
+  workflow_dispatch:  # allows manual runs
+  push:
+    branches:
+      - "main"
+    paths:  # pushes to main only trigger a build when one of these paths changed
+      - "**/*.py"
+      - "pyproject.toml"
+      - "docker/**"
+      - ".github/workflows/*.yml"
+  pull_request:
+    branches:
+      - "main"
+    paths:  # same path filter as for pushes
+      - "**/*.py"
+      - "pyproject.toml"
+      - "docker/**"
+      - ".github/workflows/*.yml"
+  release:
+    types:
+      - published
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false  # a failing device build does not cancel the other matrix jobs
+      matrix:
+        include:  # one job per device; each job runs exactly one of the three build steps below
+          - device: "cuda"
+          - device: "npu-a2"
+          - device: "npu-a3"
+
+    runs-on: ubuntu-latest
+
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.device }}  # one group per workflow, ref and device
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # in-flight runs on main are never cancelled
+
+    environment:
+      name: docker
+      url: https://hub.docker.com/r/hiyouga/llamafactory
+
+    steps:
+      - name: Free up disk space
+        uses: jlumbroso/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+          docker-images: false
+
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Get llamafactory version  # release events tag with VERSION from env.py, all other events with "latest"
+        id: version  # read below as steps.version.outputs.tag
+        run: |
+          if [ "${{ github.event_name }}" = "release" ]; then
+            echo "tag=$(grep -oP 'VERSION = "\K[^"]+' src/llamafactory/extras/env.py)" >> "$GITHUB_OUTPUT"
+          else
+            echo "tag=latest" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        if: ${{ github.event_name != 'pull_request' }}  # pull requests build only: no login, no push
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Login to Quay
+        if: ${{ github.event_name != 'pull_request' && startsWith(matrix.device, 'npu') }}  # only NPU images go to Quay
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ vars.QUAY_ASCEND_USERNAME }}
+          password: ${{ secrets.QUAY_ASCEND_TOKEN }}
+
+      - name: Build and push Docker image (CUDA)
+        if: ${{ matrix.device == 'cuda' }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./docker/docker-cuda/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: |
+            docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }}
+
+      - name: Build and push Docker image (NPU-A2)
+        if: ${{ matrix.device == 'npu-a2' }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          platforms: linux/amd64,linux/arm64
+          file: ./docker/docker-npu/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: |
+            docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }}-npu-a2
+            quay.io/ascend/llamafactory:${{ steps.version.outputs.tag }}-npu-a2
+
+      - name: Build and push Docker image (NPU-A3)
+        if: ${{ matrix.device == 'npu-a3' }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          platforms: linux/amd64,linux/arm64
+          file: ./docker/docker-npu/Dockerfile  # same Dockerfile as NPU-A2; only the BASE_IMAGE build arg differs
+          build-args: |
+            BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: |
+            docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }}-npu-a3
+            quay.io/ascend/llamafactory:${{ steps.version.outputs.tag }}-npu-a3
diff --git a/LlamaFactory/.github/workflows/publish.yml b/LlamaFactory/.github/workflows/publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..41cbff65544e4922cfe6a770005467a005d59aa1
--- /dev/null
+++ b/LlamaFactory/.github/workflows/publish.yml
@@ -0,0 +1,37 @@
+name: publish
+
+on:
+  workflow_dispatch:  # allows manual runs
+  release:
+    types:
+      - published
+
+jobs:
+  publish:
+    name: Upload release to PyPI
+
+    runs-on: ubuntu-latest
+
+    environment:
+      name: release
+      url: https://pypi.org/p/llamafactory
+
+    permissions:
+      id-token: write  # NOTE(review): presumably for PyPI trusted publishing, as the publish step gets no token
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: "3.11"  # quoted so the version stays a string
+          github-token: ${{ github.token }}
+
+      - name: Build package
+        run: |
+          make build
+
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/LlamaFactory/src/api.py b/LlamaFactory/src/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..61215459ed91c6fa529a719cb9dac57223754d2e
--- /dev/null
+++ b/LlamaFactory/src/api.py
@@ -0,0 +1,33 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import uvicorn
+
+from llamafactory.api.app import create_app
+from llamafactory.chat import ChatModel
+
+
+def main():
+    r"""Create the API app around a new ChatModel and serve it with uvicorn on API_HOST:API_PORT."""
+    app = create_app(ChatModel())
+    api_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print(f"Visit http://localhost:{api_port}/docs for API document.")
+    uvicorn.run(app, host=api_host, port=api_port)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/LlamaFactory/src/llamafactory/__init__.py b/LlamaFactory/src/llamafactory/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1567ef572714881cc464db25d3da3d08a460963
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Efficient fine-tuning of large language models.
+
+Level:
+  api, webui > chat, eval, train > data, model > hparams > extras
+
+Disable version checking: DISABLE_VERSION_CHECK=1
+Enable VRAM recording: RECORD_VRAM=1
+Force using torchrun: FORCE_TORCHRUN=1
+Set logging verbosity: LLAMAFACTORY_VERBOSITY=WARN
+Use modelscope: USE_MODELSCOPE_HUB=1
+Use openmind: USE_OPENMIND_HUB=1
+"""
+
+from .extras.env import VERSION
+
+
+__version__ = VERSION  # package version, taken from extras.env.VERSION
diff --git a/LlamaFactory/src/llamafactory/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5baa0e919b345df025d2d0fefcaa8bc97dc4b6be
Binary files /dev/null and b/LlamaFactory/src/llamafactory/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a37e0f7ca8fee40ccbdaf994d7137193c67bf576
Binary files /dev/null and b/LlamaFactory/src/llamafactory/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/__pycache__/cli.cpython-311.pyc b/LlamaFactory/src/llamafactory/__pycache__/cli.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a1b2794395848764e3378aea7114e3f8a979c68
Binary files /dev/null and b/LlamaFactory/src/llamafactory/__pycache__/cli.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/__pycache__/cli.cpython-312.pyc b/LlamaFactory/src/llamafactory/__pycache__/cli.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47f0ae90458bd19e27cede1f0f484bff5b744c67
Binary files /dev/null and b/LlamaFactory/src/llamafactory/__pycache__/cli.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/__pycache__/launcher.cpython-311.pyc b/LlamaFactory/src/llamafactory/__pycache__/launcher.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ac40e601e824808047c861384e2b0ebea9b7ecb
Binary files /dev/null and b/LlamaFactory/src/llamafactory/__pycache__/launcher.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/__pycache__/launcher.cpython-312.pyc b/LlamaFactory/src/llamafactory/__pycache__/launcher.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43ed331273537212e37d6ae392f2fcab6b86d334
Binary files /dev/null and b/LlamaFactory/src/llamafactory/__pycache__/launcher.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/api/__init__.py b/LlamaFactory/src/llamafactory/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/api/__pycache__/common.cpython-311.pyc b/LlamaFactory/src/llamafactory/api/__pycache__/common.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0767dfc39c98ccf34d788e60d27b79aee5caf98
Binary files /dev/null and b/LlamaFactory/src/llamafactory/api/__pycache__/common.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/api/__pycache__/protocol.cpython-311.pyc b/LlamaFactory/src/llamafactory/api/__pycache__/protocol.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7e3d025e8de5e0cdaa964dc4e72782385267def
Binary files /dev/null and b/LlamaFactory/src/llamafactory/api/__pycache__/protocol.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/api/app.py b/LlamaFactory/src/llamafactory/api/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec0679cb7e053058f52bdbf947cb13e554c5ca8
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/api/app.py
@@ -0,0 +1,133 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from contextlib import asynccontextmanager
+from functools import partial
+from typing import Annotated
+
+from ..chat import ChatModel
+from ..extras.constants import EngineName
+from ..extras.misc import torch_gc
+from ..extras.packages import is_fastapi_available, is_starlette_available, is_uvicorn_available
+from .chat import (
+    create_chat_completion_response,
+    create_score_evaluation_response,
+    create_stream_chat_completion_response,
+)
+from .protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ModelCard,
+    ModelList,
+    ScoreEvaluationRequest,
+    ScoreEvaluationResponse,
+)
+
+
+if is_fastapi_available():
+    from fastapi import Depends, FastAPI, HTTPException, status
+    from fastapi.middleware.cors import CORSMiddleware
+    from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer
+
+
+if is_starlette_available():
+    from sse_starlette import EventSourceResponse
+
+
+if is_uvicorn_available():
+    import uvicorn
+
+
+async def sweeper() -> None:  # background coroutine: loops until its task is cancelled
+    while True:
+        torch_gc()
+        await asyncio.sleep(300)  # five minutes between two torch_gc() calls
+
+
+@asynccontextmanager
+async def lifespan(app: "FastAPI", chat_model: "ChatModel"):  # collects GPU memory, owns the sweeper task
+    task = asyncio.create_task(sweeper()) if chat_model.engine.name == EngineName.HF else None  # hold a strong ref
+    yield
+    if task is not None:
+        task.cancel()  # the sweeper loops forever, so stop it explicitly on shutdown
+    torch_gc()
+
+
+def create_app(chat_model: "ChatModel") -> "FastAPI":
+    r"""Build the FastAPI app that serves ``chat_model``; a non-empty API_KEY is required as a bearer token."""
+    import hmac  # function-scope import: only the API key check below needs it
+
+    root_path = os.getenv("FASTAPI_ROOT_PATH", "")
+    app = FastAPI(lifespan=partial(lifespan, chat_model=chat_model), root_path=root_path)
+    app.add_middleware(
+        CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
+    )
+    api_key = os.getenv("API_KEY")
+    security = HTTPBearer(auto_error=False)
+
+    async def verify_api_key(auth: Annotated[HTTPAuthorizationCredentials | None, Depends(security)]):
+        # compare in constant time so that response timing does not reveal how much of the key matched
+        if api_key and (auth is None or not hmac.compare_digest(auth.credentials.encode(), api_key.encode())):
+            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
+
+    @app.get(
+        "/v1/models",
+        response_model=ModelList,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
+    async def list_models():
+        model_card = ModelCard(id=os.getenv("API_MODEL_NAME", "gpt-3.5-turbo"))  # env is read on every request
+        return ModelList(data=[model_card])
+
+    @app.post(
+        "/v1/chat/completions",
+        response_model=ChatCompletionResponse,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
+    async def create_chat_completion(request: ChatCompletionRequest):
+        if not chat_model.engine.can_generate:  # engines that cannot generate answer 405 here
+            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+
+        if request.stream:
+            generate = create_stream_chat_completion_response(request, chat_model)
+            return EventSourceResponse(generate, media_type="text/event-stream", sep="\n")
+        else:
+            return await create_chat_completion_response(request, chat_model)
+
+    @app.post(
+        "/v1/score/evaluation",
+        response_model=ScoreEvaluationResponse,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
+    async def create_score_evaluation(request: ScoreEvaluationRequest):
+        if chat_model.engine.can_generate:  # scoring is only served by engines that cannot generate
+            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+
+        return await create_score_evaluation_response(request, chat_model)
+
+    return app
+
+
+def run_api() -> None:
+    r"""Serve the API with uvicorn on API_HOST:API_PORT (defaults 0.0.0.0:8000) around a new ChatModel."""
+    app = create_app(ChatModel())
+    api_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print(f"Visit http://localhost:{api_port}/docs for API document.")
+    uvicorn.run(app, host=api_host, port=api_port)
diff --git a/LlamaFactory/src/llamafactory/api/chat.py b/LlamaFactory/src/llamafactory/api/chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..93236c5ca865492f0c45e1f5ab56a389875350ea
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/api/chat.py
@@ -0,0 +1,291 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import io
+import json
+import os
+import re
+import uuid
+from collections.abc import AsyncGenerator
+from typing import TYPE_CHECKING, Optional
+
+from ..data import Role as DataRole
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER
+from ..extras.misc import is_env_enabled
+from ..extras.packages import is_fastapi_available, is_pillow_available, is_requests_available
+from .common import check_lfi_path, check_ssrf_url, dictify, jsonify
+from .protocol import (
+    ChatCompletionMessage,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseUsage,
+    ChatCompletionStreamResponse,
+    ChatCompletionStreamResponseChoice,
+    Finish,
+    Function,
+    FunctionCall,
+    Role,
+    ScoreEvaluationResponse,
+)
+
+
+if is_fastapi_available():
+    from fastapi import HTTPException, status
+
+
+if is_pillow_available():
+    from PIL import Image
+
+
+if is_requests_available():
+    import requests
+
+
+if TYPE_CHECKING:
+    from ..chat import ChatModel
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+    from .protocol import ChatCompletionRequest, ScoreEvaluationRequest
+
+
+logger = logging.get_logger(__name__)
+ROLE_MAPPING = {  # API protocol role -> role string of the data layer
+    Role.USER: DataRole.USER.value,
+    Role.ASSISTANT: DataRole.ASSISTANT.value,
+    Role.SYSTEM: DataRole.SYSTEM.value,
+    Role.FUNCTION: DataRole.FUNCTION.value,
+    Role.TOOL: DataRole.OBSERVATION.value,  # tool messages map to the observation role
+}
+
+
+def _process_request(
+    request: "ChatCompletionRequest",
+) -> tuple[
+    list[dict[str, str]],
+    Optional[str],
+    Optional[str],
+    Optional[list["ImageInput"]],
+    Optional[list["VideoInput"]],
+    Optional[list["AudioInput"]],
+]:
+    r"""Turn an API request into ``(messages, system, tools, images, videos, audios)`` for the chat model.
+
+    Pops a leading system message off ``request.messages`` and raises a 400 HTTPException on malformed input.
+    """
+    if is_env_enabled("API_VERBOSE", "1"):
+        logger.info_rank0(f"==== request ====\n{json.dumps(dictify(request), indent=2, ensure_ascii=False)}")
+
+    if len(request.messages) == 0:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
+
+    system = None
+    if request.messages[0].role == Role.SYSTEM:
+        content = request.messages.pop(0).content
+        system = content[0].text if isinstance(content, list) else content
+
+    if len(request.messages) % 2 == 0:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...")
+
+    input_messages = []
+    images, videos, audios = [], [], []
+    for i, message in enumerate(request.messages):
+        expected_roles = [Role.USER, Role.TOOL] if i % 2 == 0 else [Role.ASSISTANT, Role.FUNCTION]
+        if message.role not in expected_roles:  # even turns: user/tool, odd turns: assistant/function
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+
+        if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls):
+            tool_calls = [
+                {"name": call.function.name, "arguments": call.function.arguments} for call in message.tool_calls
+            ]
+            content = json.dumps(tool_calls, ensure_ascii=False)
+            input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content})
+        elif isinstance(message.content, list):
+            text_content = ""
+            for input_item in message.content:
+                if input_item.type == "text":
+                    text_content += input_item.text
+                elif input_item.type == "image_url":
+                    text_content += IMAGE_PLACEHOLDER
+                    image_url = input_item.image_url.url
+                    if re.match(r"^data:image\/(png|jpg|jpeg|gif|bmp);base64,(.+)$", image_url):  # base64 image
+                        image_stream = io.BytesIO(base64.b64decode(image_url.split(",", maxsplit=1)[1]))
+                    elif os.path.isfile(image_url):  # local file
+                        check_lfi_path(image_url)
+                        with open(image_url, "rb") as image_file:  # read eagerly so the handle is closed at once
+                            image_stream = io.BytesIO(image_file.read())
+                    else:  # web uri
+                        check_ssrf_url(image_url)
+                        image_stream = requests.get(image_url, stream=True).raw  # NOTE(review): no timeout set
+
+                    images.append(Image.open(image_stream).convert("RGB"))
+                elif input_item.type == "video_url":
+                    text_content += VIDEO_PLACEHOLDER
+                    video_url = input_item.video_url.url
+                    if re.match(r"^data:video\/(mp4|mkv|avi|mov);base64,(.+)$", video_url):  # base64 video
+                        video_stream = io.BytesIO(base64.b64decode(video_url.split(",", maxsplit=1)[1]))
+                    elif os.path.isfile(video_url):  # local file
+                        check_lfi_path(video_url)
+                        video_stream = video_url  # local videos are passed on as a path, not opened here
+                    else:  # web uri
+                        check_ssrf_url(video_url)
+                        video_stream = requests.get(video_url, stream=True).raw  # NOTE(review): no timeout set
+
+                    videos.append(video_stream)
+                elif input_item.type == "audio_url":
+                    text_content += AUDIO_PLACEHOLDER
+                    audio_url = input_item.audio_url.url
+                    if re.match(r"^data:audio\/(mpeg|mp3|wav|ogg);base64,(.+)$", audio_url):  # base64 audio
+                        audio_stream = io.BytesIO(base64.b64decode(audio_url.split(",", maxsplit=1)[1]))
+                    elif os.path.isfile(audio_url):  # local file
+                        check_lfi_path(audio_url)
+                        audio_stream = audio_url  # local audio is passed on as a path, not opened here
+                    else:  # web uri
+                        check_ssrf_url(audio_url)
+                        audio_stream = requests.get(audio_url, stream=True).raw  # NOTE(review): no timeout set
+
+                    audios.append(audio_stream)
+                else:
+                    raise HTTPException(
+                        status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid input type {input_item.type}."
+                    )
+
+            input_messages.append({"role": ROLE_MAPPING[message.role], "content": text_content})
+        else:
+            input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content})
+
+    tools = None
+    if isinstance(request.tools, list) and len(request.tools):
+        try:
+            tools = json.dumps([dictify(tool.function) for tool in request.tools], ensure_ascii=False)
+        except (TypeError, ValueError):  # what json.dumps raises; ValueError also covers JSONDecodeError
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
+
+    return input_messages, system, tools, images or None, videos or None, audios or None
+
+
+def _create_stream_chat_completion_chunk(
+    completion_id: str,
+    model: str,
+    delta: "ChatCompletionMessage",
+    index: Optional[int] = 0,
+    finish_reason: Optional["Finish"] = None,
+) -> str:
+    r"""Serialize one streaming chunk that carries a single choice built from ``delta``."""
+    choice = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
+    return jsonify(ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice]))
+
+
+async def create_chat_completion_response(
+    request: "ChatCompletionRequest", chat_model: "ChatModel"
+) -> "ChatCompletionResponse":
+    r"""Run a non-streaming chat completion and package the model outputs as an API response.
+
+    Each model response becomes one choice. When the request declares tools, the response text is passed
+    through the template's ``extract_tool``; a list result is reported as tool calls.
+    Token usage sums the completion lengths and reports the prompt length of the last response.
+    """
+    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+    input_messages, system, tools, images, videos, audios = _process_request(request)
+    responses = await chat_model.achat(
+        input_messages,
+        system,
+        tools,
+        images,
+        videos,
+        audios,
+        do_sample=request.do_sample,
+        temperature=request.temperature,
+        top_p=request.top_p,
+        max_new_tokens=request.max_tokens,
+        num_return_sequences=request.n,
+        repetition_penalty=request.presence_penalty,
+        stop=request.stop,
+    )
+
+    choices = []
+    prompt_length, response_length = 0, 0
+    for i, response in enumerate(responses):
+        # tool calls are only parsed out of the text when the request declared tools
+        result = chat_model.engine.template.extract_tool(response.response_text) if tools else response.response_text
+        if not isinstance(result, list):
+            finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH
+            response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result)
+        else:
+            finish_reason = Finish.TOOL
+            functions = [Function(name=tool.name, arguments=tool.arguments) for tool in result]
+            tool_calls = [FunctionCall(id=f"call_{uuid.uuid4().hex}", function=function) for function in functions]
+            response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=tool_calls)
+
+        choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason))
+        prompt_length = response.prompt_length
+        response_length += response.response_length
+
+    usage = ChatCompletionResponseUsage(
+        prompt_tokens=prompt_length,
+        completion_tokens=response_length,
+        total_tokens=prompt_length + response_length,
+    )
+
+    return ChatCompletionResponse(id=completion_id, model=request.model, choices=choices, usage=usage)
+
+
+def create_stream_chat_completion_response(
+    request: "ChatCompletionRequest", chat_model: "ChatModel"
+) -> AsyncGenerator[str, None]:
+    r"""Validate the request up front (so HTTP 400s precede streaming), then return the async chunk generator."""
+    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+    input_messages, system, tools, images, videos, audios = _process_request(request)
+    if tools:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
+    if request.n > 1:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.")
+
+    def make_chunk(delta: "ChatCompletionMessage", finish_reason: Optional["Finish"] = None) -> str:
+        return _create_stream_chat_completion_chunk(completion_id, request.model, delta, finish_reason=finish_reason)
+
+    async def stream() -> AsyncGenerator[str, None]:
+        yield make_chunk(ChatCompletionMessage(role=Role.ASSISTANT, content=""))
+        async for new_token in chat_model.astream_chat(
+            input_messages,
+            system,
+            tools,
+            images,
+            videos,
+            audios,
+            do_sample=request.do_sample,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            max_new_tokens=request.max_tokens,
+            repetition_penalty=request.presence_penalty,
+            stop=request.stop,
+        ):
+            if len(new_token) != 0:  # empty tokens are not forwarded
+                yield make_chunk(ChatCompletionMessage(content=new_token))
+
+        yield make_chunk(ChatCompletionMessage(), finish_reason=Finish.STOP)
+        yield "[DONE]"
+
+    return stream()
+
+
+async def create_score_evaluation_response(
+    request: "ScoreEvaluationRequest", chat_model: "ChatModel"
+) -> "ScoreEvaluationResponse":
+    r"""Score ``request.messages`` with the chat model; an empty message list is rejected with a 400."""
+    response_id = f"scoreval-{uuid.uuid4().hex}"
+    if len(request.messages) == 0:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
+    message_scores = await chat_model.aget_scores(request.messages, max_length=request.max_length)
+    return ScoreEvaluationResponse(id=response_id, model=request.model, scores=message_scores)
diff --git a/LlamaFactory/src/llamafactory/api/common.py b/LlamaFactory/src/llamafactory/api/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4e9602de7ebc10b4f15c68ad9167cb9d80d8ef
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/api/common.py
@@ -0,0 +1,96 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ipaddress
+import json
+import os
+import socket
+from typing import TYPE_CHECKING, Any
+from urllib.parse import urlparse
+
+from ..extras.misc import is_env_enabled
+from ..extras.packages import is_fastapi_available
+
+
+if is_fastapi_available():
+    from fastapi import HTTPException, status
+
+
+if TYPE_CHECKING:
+    from pydantic import BaseModel
+
+
+SAFE_MEDIA_PATH = os.environ.get("SAFE_MEDIA_PATH", os.path.join(os.path.dirname(__file__), "safe_media"))
+ALLOW_LOCAL_FILES = is_env_enabled("ALLOW_LOCAL_FILES", "1")
+
+
+def dictify(data: "BaseModel") -> dict[str, Any]:
+    r"""Return the explicitly-set fields of a pydantic model as a dict (pydantic v2 API, v1 as fallback)."""
+    if hasattr(data, "model_dump"):  # pydantic v2; probing first keeps AttributeErrors raised inside it visible
+        return data.model_dump(exclude_unset=True)
+    return data.dict(exclude_unset=True)  # pydantic v1
+
+
+def jsonify(data: "BaseModel") -> str:
+    r"""Serialize the explicitly-set fields of a pydantic model to a JSON string without ASCII-escaping."""
+    if hasattr(data, "model_dump"):  # pydantic v2; probing first keeps AttributeErrors raised inside it visible
+        return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False)
+    return data.json(exclude_unset=True, ensure_ascii=False)  # pydantic v1
+
+
+def check_lfi_path(path: str) -> None:
+    """Checks that a path stays inside SAFE_MEDIA_PATH (LFI guard). Raises HTTPException if unsafe."""
+    if not ALLOW_LOCAL_FILES:
+        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Local file access is disabled.")
+
+    try:
+        os.makedirs(SAFE_MEDIA_PATH, exist_ok=True)
+        real_path = os.path.realpath(path)
+        safe_path = os.path.realpath(SAFE_MEDIA_PATH)
+        inside_safe_dir = os.path.commonpath([safe_path, real_path]) == safe_path  # component-wise, not a prefix
+    except Exception:  # the path could not be resolved or compared (e.g. embedded NUL byte, different drives)
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid or inaccessible file path.")
+
+    if not inside_safe_dir:  # checked outside the `try` so this 403 is not rewritten into the 400 above
+        detail = "File access is restricted to the safe media directory."
+        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
+
+
+def check_ssrf_url(url: str) -> None:
+    """Rejects URLs that are not http(s) or that resolve to a non-global address. Raises HTTPException if unsafe."""
+    try:
+        parsed_url = urlparse(url)
+        if parsed_url.scheme not in ["http", "https"]:
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only HTTP/HTTPS URLs are allowed.")
+
+        hostname = parsed_url.hostname
+        if not hostname:
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid URL hostname.")
+
+        # A name may resolve to several addresses: vet every record, not only the first; refuse an empty answer.
+        resolved = socket.getaddrinfo(hostname, parsed_url.port)
+        addresses = [ipaddress.ip_address(sockaddr[0]) for *_, sockaddr in resolved]
+        if not addresses or any(not ip.is_global for ip in addresses):
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail="Access to private or reserved IP addresses is not allowed.",
+            )
+    except HTTPException:  # keep the specific status/detail chosen above instead of rewrapping it below
+        raise
+    except socket.gaierror:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST, detail=f"Could not resolve hostname: {parsed_url.hostname}"
+        )
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid URL: {e}")
diff --git a/LlamaFactory/src/llamafactory/api/protocol.py b/LlamaFactory/src/llamafactory/api/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..675523f062316f3e332d13884e7322aa60050905
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/api/protocol.py
@@ -0,0 +1,156 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from enum import Enum, unique
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field
+
+
+@unique
+class Role(str, Enum):  # type of `ChatMessage.role`; the `str` mixin makes members equal to their string values
+    USER = "user"
+    ASSISTANT = "assistant"
+    SYSTEM = "system"
+    FUNCTION = "function"
+    TOOL = "tool"
+
+
+@unique
+class Finish(str, Enum):  # type of the `finish_reason` field on response choices
+    STOP = "stop"
+    LENGTH = "length"
+    TOOL = "tool_calls"
+
+
+class ModelCard(BaseModel):  # element type of `ModelList.data`
+    id: str
+    object: Literal["model"] = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))  # Unix time in whole seconds at instantiation
+    owned_by: Literal["owner"] = "owner"
+
+
+class ModelList(BaseModel):  # list wrapper around ModelCard entries
+    object: Literal["list"] = "list"
+    data: list[ModelCard] = []  # NOTE(review): mutable default; relies on pydantic copying defaults — confirm
+
+
+class Function(BaseModel):  # type of `FunctionCall.function`
+    name: str
+    arguments: str  # presumably a JSON-encoded argument object — TODO confirm against the producer
+
+
+class FunctionDefinition(BaseModel):  # type of `FunctionAvailable.function`
+    name: str
+    description: str  # required: no default is declared
+    parameters: dict[str, Any]  # presumably a JSON-schema-like mapping — TODO confirm
+
+
+class FunctionAvailable(BaseModel):  # element type of `ChatCompletionRequest.tools`
+    type: Literal["function", "code_interpreter"] = "function"
+    function: FunctionDefinition | None = None  # optional
+
+
+class FunctionCall(BaseModel):  # element type of the `tool_calls` lists on ChatMessage / ChatCompletionMessage
+    id: str
+    type: Literal["function"] = "function"
+    function: Function
+
+
+class URL(BaseModel):  # type of the `*_url` fields of MultimodalInputItem
+    url: str
+    detail: Literal["auto", "low", "high"] = "auto"
+
+
+class MultimodalInputItem(BaseModel):  # one element of a list-valued `ChatMessage.content`
+    type: Literal["text", "image_url", "video_url", "audio_url"]  # presumably names the populated field — confirm
+    text: str | None = None
+    image_url: URL | None = None
+    video_url: URL | None = None
+    audio_url: URL | None = None
+
+
+class ChatMessage(BaseModel):  # element type of `ChatCompletionRequest.messages`
+    role: Role
+    content: str | list[MultimodalInputItem] | None = None  # plain text, a list of multimodal items, or absent
+    tool_calls: list[FunctionCall] | None = None
+
+
+class ChatCompletionMessage(BaseModel):  # type of `message` and of the streaming `delta` in response choices
+    role: Role | None = None  # every field is optional
+    content: str | None = None
+    tool_calls: list[FunctionCall] | None = None
+
+
+class ChatCompletionRequest(BaseModel):  # chat completion request schema
+    model: str
+    messages: list[ChatMessage]
+    tools: list[FunctionAvailable] | None = None
+    do_sample: bool | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    n: int = 1  # the streaming response generator rejects n > 1 with HTTP 400
+    presence_penalty: float | None = None  # the streaming response generator forwards it as `repetition_penalty`
+    max_tokens: int | None = None  # the streaming response generator forwards it as `max_new_tokens`
+    stop: str | list[str] | None = None
+    stream: bool = False
+
+
+class ChatCompletionResponseChoice(BaseModel):  # element type of `ChatCompletionResponse.choices`
+    index: int
+    message: ChatCompletionMessage
+    finish_reason: Finish  # required here, optional on the streaming counterpart
+
+
+class ChatCompletionStreamResponseChoice(BaseModel):  # element type of `ChatCompletionStreamResponse.choices`
+    index: int
+    delta: ChatCompletionMessage
+    finish_reason: Finish | None = None  # optional, unlike `ChatCompletionResponseChoice.finish_reason`
+
+
+class ChatCompletionResponseUsage(BaseModel):  # type of `ChatCompletionResponse.usage`
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int  # presumably prompt_tokens + completion_tokens; not enforced by this model — confirm
+
+
+class ChatCompletionResponse(BaseModel):  # schema of a non-streaming chat completion result
+    id: str
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))  # Unix time in whole seconds at instantiation
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    usage: ChatCompletionResponseUsage
+
+
+class ChatCompletionStreamResponse(BaseModel):  # schema of one streamed chunk; has no `usage` field
+    id: str
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))  # Unix time in whole seconds at instantiation
+    model: str
+    choices: list[ChatCompletionStreamResponseChoice]
+
+
+class ScoreEvaluationRequest(BaseModel):  # input of create_score_evaluation_response()
+    model: str
+    messages: list[str]  # passed to `ChatModel.aget_scores`; an empty list is rejected with HTTP 400
+    max_length: int | None = None  # forwarded to `ChatModel.aget_scores` as `max_length`
+
+
+class ScoreEvaluationResponse(BaseModel):  # output of create_score_evaluation_response()
+    id: str  # built there as "scoreval-" followed by a random UUID hex string
+    object: Literal["score.evaluation"] = "score.evaluation"
+    model: str  # copied from the request
+    scores: list[float]  # value returned by `ChatModel.aget_scores`
diff --git a/LlamaFactory/src/llamafactory/chat/__init__.py b/LlamaFactory/src/llamafactory/chat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..15d8b9ba2d77d6f300d59300da5a49abd3ed4e57
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/chat/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base_engine import BaseEngine
+from .chat_model import ChatModel
+
+
+__all__ = ["BaseEngine", "ChatModel"]
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cee02d9d1a045f8a54416a215595cbb48e5816f
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3da5891b54cec08c062fe652998947ccaf1fcaa
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/base_engine.cpython-311.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/base_engine.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eaa3ffce5a899d999629eb9bdf1ad3b17e564ba4
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/base_engine.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/base_engine.cpython-312.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/base_engine.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d40dd46f9441465373d3735d1ca171547a729d0
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/base_engine.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/chat_model.cpython-311.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/chat_model.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4008b23e56e416e6e7bacc42f2ba4afb375fcc7b
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/chat_model.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/chat_model.cpython-312.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/chat_model.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df0f0ea481e3dce319c80ac645d08ef82d1b9243
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/chat_model.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/hf_engine.cpython-311.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/hf_engine.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..991ca0f69dfe2b759c640660ad7fb4a2f6d2e08f
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/hf_engine.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/__pycache__/hf_engine.cpython-312.pyc b/LlamaFactory/src/llamafactory/chat/__pycache__/hf_engine.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6b47bbaa4dcec453edf6eae8fc3e065361dfa1b
Binary files /dev/null and b/LlamaFactory/src/llamafactory/chat/__pycache__/hf_engine.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/chat/base_engine.py b/LlamaFactory/src/llamafactory/chat/base_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d497c1ae927f94f396c18833b18cdb894cbd59d
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/chat/base_engine.py
@@ -0,0 +1,98 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, PreTrainedTokenizer
+    from vllm import AsyncLLMEngine
+
+    from ..data import Template
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+    from ..extras.constants import EngineName
+    from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+@dataclass
+class Response:  # element type of the list returned by BaseEngine.chat()
+    response_text: str
+    response_length: int  # presumably a token count — TODO confirm against the engines
+    prompt_length: int  # presumably a token count — TODO confirm against the engines
+    finish_reason: Literal["stop", "length"]
+
+
+class BaseEngine(ABC):
+    r"""Base class for the inference engines of chat models.
+
+    Subclasses must implement the async methods chat(), stream_chat() and get_scores().
+    """
+
+    name: "EngineName"
+    model: Union["PreTrainedModel", "AsyncLLMEngine"]
+    tokenizer: "PreTrainedTokenizer"
+    can_generate: bool  # HuggingfaceEngine sets this to `finetuning_args.stage == "sft"`
+    template: "Template"
+    generating_args: dict[str, Any]  # HuggingfaceEngine stores `generating_args.to_dict()` here
+
+    @abstractmethod
+    def __init__(
+        self,
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+    ) -> None:
+        r"""Initialize an inference engine from the four argument groups."""
+        ...
+
+    @abstractmethod
+    async def chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> list["Response"]:
+        r"""Get a list of responses of the chat model."""
+        ...
+
+    @abstractmethod
+    async def stream_chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        r"""Get the response of the chat model token-by-token (ChatModel consumes it with `async for`)."""
+        ...
+
+    @abstractmethod
+    async def get_scores(
+        self,
+        batch_input: list[str],
+        **input_kwargs,
+    ) -> list[float]:
+        r"""Get a list of scores of the reward model."""
+        ...
diff --git a/LlamaFactory/src/llamafactory/chat/chat_model.py b/LlamaFactory/src/llamafactory/chat/chat_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb612f88d468d76f06eefa45b96c1bfa0351fa7c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/chat/chat_model.py
@@ -0,0 +1,210 @@
+# Copyright 2025 THUDM and the LlamaFactory team.
+#
+# This code is inspired by the THUDM's ChatGLM implementation.
+# https://github.com/THUDM/ChatGLM-6B/blob/main/cli_demo.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from collections.abc import AsyncGenerator, Generator
+from threading import Thread
+from typing import TYPE_CHECKING, Any, Optional
+
+from ..extras.constants import EngineName
+from ..extras.misc import torch_gc
+from ..hparams import get_infer_args
+
+
+if TYPE_CHECKING:
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+    from .base_engine import BaseEngine, Response
+
+
+def _start_background_loop(loop: "asyncio.AbstractEventLoop") -> None:  # thread target used by ChatModel
+    asyncio.set_event_loop(loop)  # make `loop` the current event loop of the thread running this function
+    loop.run_forever()  # does not return until the loop is stopped
+
+
+class ChatModel:
+    r"""General class for chat models. Backed by huggingface, vllm, sglang or ktransformers engines.
+
+    Supports both sync and async methods.
+    Sync methods: chat(), stream_chat() and get_scores(); they run the async ones on a background event loop.
+    Async methods: achat(), astream_chat() and aget_scores(); they delegate to the selected engine.
+    """
+
+    def __init__(self, args: Optional[dict[str, Any]] = None) -> None:
+        model_args, data_args, finetuning_args, generating_args = get_infer_args(args)
+
+        if model_args.infer_backend == EngineName.HF:
+            from .hf_engine import HuggingfaceEngine
+
+            self.engine: BaseEngine = HuggingfaceEngine(model_args, data_args, finetuning_args, generating_args)
+        elif model_args.infer_backend == EngineName.VLLM:
+            try:  # the optional backend is imported lazily; the `try` also covers the engine constructor
+                from .vllm_engine import VllmEngine
+
+                self.engine: BaseEngine = VllmEngine(model_args, data_args, finetuning_args, generating_args)
+            except ImportError as e:
+                raise ImportError(
+                    "vLLM not install, you may need to run `pip install vllm`\n"
+                    "or try to use HuggingFace backend: --infer_backend huggingface"
+                ) from e
+        elif model_args.infer_backend == EngineName.SGLANG:
+            try:  # same lazy-import pattern as the vLLM branch
+                from .sglang_engine import SGLangEngine
+
+                self.engine: BaseEngine = SGLangEngine(model_args, data_args, finetuning_args, generating_args)
+            except ImportError as e:
+                raise ImportError(
+                    "SGLang not install, you may need to run `pip install sglang[all]`\n"
+                    "or try to use HuggingFace backend: --infer_backend huggingface"
+                ) from e
+        elif model_args.infer_backend == EngineName.KT:
+            try:  # same lazy-import pattern as the vLLM branch
+                from .kt_engine import KTransformersEngine
+
+                self.engine: BaseEngine = KTransformersEngine(model_args, data_args, finetuning_args, generating_args)
+            except ImportError as e:
+                raise ImportError(
+                    "KTransformers not install, you may need to run `pip install ktransformers`\n"
+                    "or try to use HuggingFace backend: --infer_backend huggingface"
+                ) from e
+        else:
+            raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}")
+
+        self._loop = asyncio.new_event_loop()  # private loop on which the sync wrappers schedule coroutines
+        self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True)
+        self._thread.start()  # daemon thread, so it does not keep the interpreter alive at exit
+
+    def chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> list["Response"]:
+        r"""Get a list of responses of the chat model."""
+        task = asyncio.run_coroutine_threadsafe(
+            self.achat(messages, system, tools, images, videos, audios, **input_kwargs), self._loop
+        )
+        return task.result()  # blocks the calling thread until the coroutine has finished on the background loop
+
+    async def achat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> list["Response"]:
+        r"""Asynchronously get a list of responses of the chat model."""
+        return await self.engine.chat(messages, system, tools, images, videos, audios, **input_kwargs)
+
+    def stream_chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> Generator[str, None, None]:
+        r"""Get the response token-by-token of the chat model."""
+        generator = self.astream_chat(messages, system, tools, images, videos, audios, **input_kwargs)
+        while True:
+            try:  # advance the async generator by one item on the background loop and wait for that item
+                task = asyncio.run_coroutine_threadsafe(generator.__anext__(), self._loop)
+                yield task.result()
+            except StopAsyncIteration:  # surfaced by task.result() once the async generator is exhausted
+                break
+
+    async def astream_chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        r"""Asynchronously get the response token-by-token of the chat model."""
+        async for new_token in self.engine.stream_chat(
+            messages, system, tools, images, videos, audios, **input_kwargs
+        ):
+            yield new_token
+
+    def get_scores(
+        self,
+        batch_input: list[str],
+        **input_kwargs,
+    ) -> list[float]:
+        r"""Get a list of scores of the reward model."""
+        task = asyncio.run_coroutine_threadsafe(self.aget_scores(batch_input, **input_kwargs), self._loop)
+        return task.result()  # blocks the calling thread until the coroutine has finished on the background loop
+
+    async def aget_scores(
+        self,
+        batch_input: list[str],
+        **input_kwargs,
+    ) -> list[float]:
+        r"""Asynchronously get a list of scores of the reward model."""
+        return await self.engine.get_scores(batch_input, **input_kwargs)
+
+
+def run_chat() -> None:
+    r"""Run an interactive command-line chat loop on top of a freshly created ChatModel."""
+    if os.name != "nt":
+        try:
+            import readline  # noqa: F401
+        except ImportError:
+            print("Install `readline` for a better experience.")
+
+    chat_model = ChatModel()
+    history = []
+    print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
+
+    while True:
+        try:
+            query = input("\nUser: ")
+        except UnicodeDecodeError:  # any other exception from input() simply propagates
+            print("Detected decoding error at the inputs, please set the terminal encoding to utf-8.")
+            continue
+
+        command = query.strip()
+        if command == "exit":
+            break
+
+        if command == "clear":
+            history = []
+            torch_gc()
+            print("History has been removed.")
+            continue
+
+        history.append({"role": "user", "content": query})
+        print("Assistant: ", end="", flush=True)
+
+        pieces = []  # streamed fragments; joined into the assistant turn once the stream ends
+        for new_text in chat_model.stream_chat(history):
+            print(new_text, end="", flush=True)
+            pieces.append(new_text)
+        print()
+        history.append({"role": "assistant", "content": "".join(pieces)})
diff --git a/LlamaFactory/src/llamafactory/chat/hf_engine.py b/LlamaFactory/src/llamafactory/chat/hf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e670b92c99e3d50362184fdd690cd372fe033d6
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/chat/hf_engine.py
@@ -0,0 +1,412 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from collections.abc import AsyncGenerator, Callable
+from threading import Thread
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import torch
+from transformers import GenerationConfig, TextIteratorStreamer
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER, EngineName
+from ..model import load_model, load_tokenizer
+from .base_engine import BaseEngine, Response
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
+    from trl import PreTrainedModelWrapper
+
+    from ..data import Template
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+    from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class HuggingfaceEngine(BaseEngine):
+    def __init__(
+        self,
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+    ) -> None:
+        self.name = EngineName.HF
+        self.can_generate = finetuning_args.stage == "sft"
+        tokenizer_module = load_tokenizer(model_args)
+        self.tokenizer = tokenizer_module["tokenizer"]
+        self.processor = tokenizer_module["processor"]
+        self.tokenizer.padding_side = "left" if self.can_generate else "right"
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+        self.model = load_model(
+            self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
+        )  # must after fixing tokenizer to resize vocab
+        self.generating_args = generating_args.to_dict()
+        try:
+            asyncio.get_event_loop()
+        except RuntimeError:
+            logger.warning_rank0_once("There is no current event loop, creating a new one.")
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1")))
+
+    @staticmethod
+    def _process_args(
+        model: "PreTrainedModel",
+        tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
+        template: "Template",
+        generating_args: dict[str, Any],
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        input_kwargs: Optional[dict[str, Any]] = {},
+    ) -> tuple[dict[str, Any], int]:
+        mm_input_dict = {"images": [], "videos": [], "audios": [], "imglens": [0], "vidlens": [0], "audlens": [0]}
+        if images is not None:
+            mm_input_dict.update({"images": images, "imglens": [len(images)]})
+            if not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
+                messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
+
+        if videos is not None:
+            mm_input_dict.update({"videos": videos, "vidlens": [len(videos)]})
+            if not any(VIDEO_PLACEHOLDER in message["content"] for message in messages):
+                messages[0]["content"] = VIDEO_PLACEHOLDER * len(videos) + messages[0]["content"]
+
+        if audios is not None:
+            mm_input_dict.update({"audios": audios, "audlens": [len(audios)]})
+            if not any(AUDIO_PLACEHOLDER in message["content"] for message in messages):
+                messages[0]["content"] = AUDIO_PLACEHOLDER * len(audios) + messages[0]["content"]
+
+        messages = template.mm_plugin.process_messages(
+            messages, mm_input_dict["images"], mm_input_dict["videos"], mm_input_dict["audios"], processor
+        )
+        paired_messages = messages + [{"role": "assistant", "content": ""}]
+        prompt_ids, _ = template.encode_oneturn(tokenizer, paired_messages, system, tools)
+        prompt_ids, _ = template.mm_plugin.process_token_ids(
+            prompt_ids,
+            None,
+            mm_input_dict["images"],
+            mm_input_dict["videos"],
+            mm_input_dict["audios"],
+            tokenizer,
+            processor,
+        )
+        prompt_length = len(prompt_ids)
+        inputs = torch.tensor([prompt_ids], device=model.device)
+        attention_mask = torch.ones_like(inputs, dtype=torch.long)
+
+        do_sample: Optional[bool] = input_kwargs.pop("do_sample", None)
+        temperature: Optional[float] = input_kwargs.pop("temperature", None)
+        top_p: Optional[float] = input_kwargs.pop("top_p", None)
+        top_k: Optional[float] = input_kwargs.pop("top_k", None)
+        num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+        repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+        length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None)
+        skip_special_tokens: Optional[bool] = input_kwargs.pop("skip_special_tokens", None)
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+        stop: Optional[Union[str, list[str]]] = input_kwargs.pop("stop", None)
+
+        if stop is not None:
+            logger.warning_rank0("Stop parameter is not supported by the huggingface engine yet.")
+
+        generating_args = generating_args.copy()
+        generating_args.update(
+            dict(
+                do_sample=do_sample if do_sample is not None else generating_args["do_sample"],
+                temperature=temperature if temperature is not None else generating_args["temperature"],
+                top_p=top_p if top_p is not None else generating_args["top_p"],
+                top_k=top_k if top_k is not None else generating_args["top_k"],
+                num_return_sequences=num_return_sequences,
+                repetition_penalty=repetition_penalty
+                if repetition_penalty is not None
+                else generating_args["repetition_penalty"],
+                length_penalty=length_penalty if length_penalty is not None else generating_args["length_penalty"],
+                skip_special_tokens=skip_special_tokens
+                if skip_special_tokens is not None
+                else generating_args["skip_special_tokens"],
+                eos_token_id=template.get_stop_token_ids(tokenizer),
+                pad_token_id=tokenizer.pad_token_id,
+            )
+        )
+
+        if isinstance(num_return_sequences, int) and num_return_sequences > 1:  # do_sample needs temperature > 0
+            generating_args["do_sample"] = True
+            generating_args["temperature"] = generating_args["temperature"] or 1.0
+
+        if not generating_args["temperature"]:
+            generating_args["do_sample"] = False
+
+        if not generating_args["do_sample"]:
+            generating_args.pop("temperature", None)
+            generating_args.pop("top_p", None)
+
+        if max_length:
+            generating_args.pop("max_new_tokens", None)
+            generating_args["max_length"] = max_length
+
+        if max_new_tokens:
+            generating_args.pop("max_length", None)
+            generating_args["max_new_tokens"] = max_new_tokens
+
+        gen_kwargs = dict(
+            inputs=inputs,
+            attention_mask=attention_mask,
+            generation_config=GenerationConfig(**generating_args),
+        )
+
+        mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, batch_ids=[prompt_ids], processor=processor)
+        for key, value in mm_inputs.items():
+            if isinstance(value, list) and isinstance(value[0], torch.Tensor):  # for pixtral inputs
+                value = torch.stack(value)  # assume they have same sizes
+            elif (
+                isinstance(value, list) and isinstance(value[0], list) and isinstance(value[0][0], torch.Tensor)
+            ):  # for minicpmv inputs
+                value = torch.stack([torch.stack(v) for v in value])
+            elif not isinstance(value, torch.Tensor):
+                value = torch.tensor(value)
+
+            if torch.is_floating_point(value):  # cast data dtype for paligemma
+                value = value.to(model.dtype)
+
+            if key == "second_per_grid_ts":  # qwen2.5vl special case
+                gen_kwargs[key] = value.tolist()
+            else:
+                gen_kwargs[key] = value.to(model.device)
+
+        if getattr(model.config, "model_type", None) in ["minicpmv", "minicpmo"]:
+            gen_kwargs["input_ids"] = inputs
+            gen_kwargs["tokenizer"] = tokenizer
+            if "audio_feature_lens" in mm_inputs:
+                gen_kwargs["audio_feature_lens"] = mm_inputs["audio_feature_lens"]
+
+            gen_kwargs.pop("image_sizes", None)
+
+        return gen_kwargs, prompt_length
+
+    @staticmethod
+    @torch.inference_mode()
+    def _chat(
+        model: "PreTrainedModel",
+        tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
+        template: "Template",
+        generating_args: dict[str, Any],
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        input_kwargs: Optional[dict[str, Any]] = {},
+    ) -> list["Response"]:
+        r"""Generate complete responses for one conversation (blocking call).
+
+        Args:
+            model: Model whose ``generate`` method is invoked.
+            tokenizer: Tokenizer used to decode the generated ids.
+            processor: Optional multimodal processor, forwarded to ``_process_args``.
+            template: Chat template, forwarded to ``_process_args``.
+            generating_args: Default generation settings, forwarded to ``_process_args``.
+            messages: Conversation turns.
+            system: Optional system prompt.
+            tools: Optional tool description.
+            images: Optional image inputs.
+            videos: Optional video inputs.
+            audios: Optional audio inputs.
+            input_kwargs: Per-request overrides, forwarded to ``_process_args``.
+
+        Returns:
+            One ``Response`` per generated sequence. ``finish_reason`` is ``"stop"`` when a stop token
+            appears among the generated ids and ``"length"`` otherwise; ``response_length`` counts the
+            tokens up to and including the first stop token.
+        """
+        gen_kwargs, prompt_length = HuggingfaceEngine._process_args(
+            model,
+            tokenizer,
+            processor,
+            template,
+            generating_args,
+            messages,
+            system,
+            tools,
+            images,
+            videos,
+            audios,
+            input_kwargs,
+        )
+        generation_config = gen_kwargs["generation_config"]
+        generate_output = model.generate(**gen_kwargs)
+        if isinstance(generate_output, tuple):
+            generate_output = generate_output[1][0]  # post-process the minicpm_o output
+
+        response_ids = generate_output[:, prompt_length:]
+        response = tokenizer.batch_decode(
+            response_ids,
+            skip_special_tokens=getattr(generation_config, "skip_special_tokens", True),
+            clean_up_tokenization_spaces=True,
+        )
+
+        # Generation is configured to halt on every id in `generation_config.eos_token_id` (the
+        # template's stop token ids), so all of them must count as a stop here. Checking only
+        # `tokenizer.eos_token_id` would report a stop on any other stop token as "length".
+        configured_stop_ids = getattr(generation_config, "eos_token_id", None)
+        if configured_stop_ids is None:
+            configured_stop_ids = []
+        elif isinstance(configured_stop_ids, int):
+            configured_stop_ids = [configured_stop_ids]
+
+        stop_ids = {int(token_id) for token_id in configured_stop_ids}
+        if tokenizer.eos_token_id is not None:
+            stop_ids.add(tokenizer.eos_token_id)
+
+        stop_tensor = torch.tensor(sorted(stop_ids), dtype=response_ids.dtype, device=response_ids.device)
+
+        results = []
+        for i in range(len(response)):
+            eos_index = torch.isin(response_ids[i], stop_tensor).nonzero()
+            response_length = (eos_index[0].item() + 1) if len(eos_index) else len(response_ids[i])
+            results.append(
+                Response(
+                    response_text=response[i],
+                    response_length=response_length,
+                    prompt_length=prompt_length,
+                    finish_reason="stop" if len(eos_index) else "length",
+                )
+            )
+
+        return results
+
+    @staticmethod
+    @torch.inference_mode()
+    def _stream_chat(
+        model: "PreTrainedModel",
+        tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
+        template: "Template",
+        generating_args: dict[str, Any],
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        input_kwargs: Optional[dict[str, Any]] = {},
+    ) -> Callable[[], str]:
+        r"""Start generation on a daemon thread and return a blocking "next chunk" callable.
+
+        The arguments are forwarded unchanged to ``_process_args``. ``model.generate`` then runs in
+        a background thread and writes into a ``TextIteratorStreamer``.
+
+        Returns:
+            A zero-argument function returning the next text chunk from the streamer. Once the
+            streamer is exhausted it raises ``StopAsyncIteration`` (translated from ``StopIteration``).
+        """
+        gen_kwargs, _ = HuggingfaceEngine._process_args(
+            model, tokenizer, processor, template, generating_args,
+            messages, system, tools, images, videos, audios, input_kwargs,
+        )  # fmt: skip
+        hide_special_tokens = getattr(gen_kwargs["generation_config"], "skip_special_tokens", True)
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=hide_special_tokens)
+        gen_kwargs["streamer"] = streamer
+        Thread(target=model.generate, kwargs=gen_kwargs, daemon=True).start()
+
+        def stream():
+            try:
+                return next(streamer)
+            except StopIteration:
+                raise StopAsyncIteration()
+
+        return stream
+
+    @staticmethod
+    @torch.inference_mode()
+    def _get_scores(
+        model: "PreTrainedModelWrapper",
+        tokenizer: "PreTrainedTokenizer",
+        batch_input: list[str],
+        input_kwargs: Optional[dict[str, Any]] = {},
+    ) -> list[float]:
+        r"""Score a batch of texts with a value-head model (blocking call).
+
+        Args:
+            model: Wrapped model; the last element of its output is used as the per-token values.
+            tokenizer: Tokenizer used to pad, truncate and encode ``batch_input``.
+            batch_input: Texts to score; no special tokens are added during encoding.
+            input_kwargs: Optional overrides; only ``max_length`` is consumed (popped) here.
+
+        Returns:
+            The values gathered at index ``attention_mask.sum(-1) - 1`` of every row, returned as
+            produced by ``gather``.
+            NOTE(review): this is a tensor rather than the annotated ``list[float]`` - confirm callers.
+        """
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        target_device = getattr(model.pretrained_model, "device", "cuda")
+        truncate_to = max_length or getattr(model.config, "max_position_embeddings", 1024)
+        encoded: dict[str, torch.Tensor] = tokenizer(
+            batch_input,
+            padding=True,
+            truncation=True,
+            max_length=truncate_to,
+            return_tensors="pt",
+            add_special_tokens=False,
+        ).to(target_device)
+        values: torch.Tensor = model(**encoded, return_dict=True, use_cache=False)[-1]
+        # Index of the last attended position in each row (attended-token count minus one).
+        last_position = encoded["attention_mask"].sum(dim=-1, keepdim=True) - 1
+        return values.gather(dim=-1, index=last_position)
+
+    @override
+    async def chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> list["Response"]:
+        r"""Produce complete responses for a conversation without blocking the event loop.
+
+        The blocking ``_chat`` helper runs in a worker thread while the engine semaphore is held.
+
+        Raises:
+            ValueError: If the loaded model cannot generate text.
+        """
+        if not self.can_generate:
+            raise ValueError("The current model does not support `chat`.")
+
+        async with self.semaphore:
+            return await asyncio.to_thread(
+                self._chat,
+                self.model,
+                self.tokenizer,
+                self.processor,
+                self.template,
+                self.generating_args,
+                messages,
+                system,
+                tools,
+                images,
+                videos,
+                audios,
+                input_kwargs,
+            )
+
+    @override
+    async def stream_chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        r"""Yield response text chunk by chunk while holding the engine semaphore.
+
+        Each chunk is fetched in a worker thread through the callable returned by ``_stream_chat``;
+        the stream ends when that callable raises ``StopAsyncIteration``.
+
+        Raises:
+            ValueError: If the loaded model cannot generate text.
+        """
+        if not self.can_generate:
+            raise ValueError("The current model does not support `stream_chat`.")
+
+        async with self.semaphore:
+            next_chunk = self._stream_chat(
+                self.model,
+                self.tokenizer,
+                self.processor,
+                self.template,
+                self.generating_args,
+                messages,
+                system,
+                tools,
+                images,
+                videos,
+                audios,
+                input_kwargs,
+            )
+            exhausted = False
+            while not exhausted:
+                try:
+                    yield await asyncio.to_thread(next_chunk)
+                except StopAsyncIteration:
+                    exhausted = True
+
+    @override
+    async def get_scores(
+        self,
+        batch_input: list[str],
+        **input_kwargs,
+    ) -> list[float]:
+        r"""Score ``batch_input`` by running ``_get_scores`` in a worker thread under the semaphore.
+
+        Raises:
+            ValueError: If the loaded model is a generative (auto-regressive) one.
+        """
+        if self.can_generate:
+            raise ValueError("Cannot get scores using an auto-regressive model.")
+
+        async with self.semaphore:
+            return await asyncio.to_thread(self._get_scores, self.model, self.tokenizer, batch_input, input_kwargs)
diff --git a/LlamaFactory/src/llamafactory/chat/kt_engine.py b/LlamaFactory/src/llamafactory/chat/kt_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bf3f4bb2b685ee971d538d29f0b6afa16956f2c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/chat/kt_engine.py
@@ -0,0 +1,284 @@
+# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+import platform
+from collections.abc import AsyncGenerator
+from threading import Thread
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import EngineName
+from ..model import load_model, load_tokenizer
+from .base_engine import BaseEngine, Response
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+    from trl import PreTrainedModelWrapper
+
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+    from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
+from ktransformers.server.config.config import Config
+from ktransformers.util.utils import (
+    get_compute_capability,
+    prefill_and_generate_capture,
+)
+from ktransformers.util.vendors import GPUVendor, device_manager
+
+
+logger = logging.get_logger(__name__)
+
+
+class KTransformersEngine(BaseEngine):
+    def __init__(
+        self,
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+    ) -> None:
+        r"""Load the tokenizer, template and model and record the KTransformers runtime options.
+
+        Generation is available only when ``finetuning_args.stage`` is ``"sft"``; otherwise the
+        model is loaded with a value head and the tokenizer pads on the right.
+        """
+        self.name = EngineName.KT
+        self.can_generate = finetuning_args.stage == "sft"
+
+        tokenizer = load_tokenizer(model_args)["tokenizer"]
+        tokenizer.padding_side = "left" if self.can_generate else "right"
+        self.tokenizer = tokenizer
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+
+        needs_value_head = not self.can_generate
+        self.model = load_model(
+            self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=needs_value_head
+        )
+
+        self.generating_args = generating_args.to_dict()
+
+        # KTransformers-specific options taken from the model arguments.
+        self.max_new_tokens = model_args.kt_maxlen
+        self.use_cuda_graph = model_args.kt_use_cuda_graph
+        self.mode = model_args.kt_mode
+        self.force_think = model_args.kt_force_think
+        self.chunk_size = model_args.chunk_size
+
+        # Make sure this thread has an event loop before the semaphore is created.
+        try:
+            asyncio.get_event_loop()
+        except RuntimeError:
+            asyncio.set_event_loop(asyncio.new_event_loop())
+
+        concurrency_limit = int(os.getenv("MAX_CONCURRENT", "1"))
+        self.semaphore = asyncio.Semaphore(concurrency_limit)
+
+    @staticmethod
+    @torch.inference_mode()
+    def _get_scores(
+        model: "PreTrainedModelWrapper",
+        tokenizer: "PreTrainedTokenizer",
+        batch_input: list[str],
+        input_kwargs: Optional[dict[str, Any]] = {},
+    ) -> list[float]:
+        r"""Compute one score per input text with the value-head model (blocking call).
+
+        Args:
+            model: Wrapped model; the last element of its output is read as the per-token values.
+            tokenizer: Tokenizer used to pad, truncate and encode the batch.
+            batch_input: Texts to score; encoded without adding special tokens.
+            input_kwargs: Optional overrides; only ``max_length`` is consumed (popped) here.
+
+        Returns:
+            Values gathered at index ``attention_mask.sum(-1) - 1`` for every row, returned exactly
+            as ``gather`` produces them.
+            NOTE(review): this is a tensor rather than the annotated ``list[float]`` - confirm callers.
+        """
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        run_device = getattr(model.pretrained_model, "device", "cuda")
+        length_cap = max_length or getattr(model.config, "max_position_embeddings", 1024)
+        batch = tokenizer(
+            batch_input,
+            padding=True,
+            truncation=True,
+            max_length=length_cap,
+            return_tensors="pt",
+            add_special_tokens=False,
+        ).to(run_device)
+        values: torch.Tensor = model(**batch, return_dict=True, use_cache=False)[-1]
+        # Attended-token count minus one selects the last attended position of each row.
+        final_index = batch["attention_mask"].sum(dim=-1, keepdim=True) - 1
+        return values.gather(dim=-1, index=final_index)
+
+    async def _generate(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        **input_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        r"""Stream the text chunks produced by KTransformers for one conversation.
+
+        The prompt is encoded with the chat template, ``prefill_and_generate_capture`` runs on a
+        daemon thread, and every chunk it produces is handed to the caller's event loop via a queue.
+
+        Args:
+            messages: Conversation turns; an empty assistant turn is appended before encoding.
+            system: Optional system prompt passed to the template.
+            tools: Optional tool description passed to the template.
+            **input_kwargs: Only ``max_length`` and ``max_new_tokens`` are consumed here.
+
+        Yields:
+            Chunks from ``prefill_and_generate_capture``, converted to ``str`` when necessary.
+
+        Raises:
+            AssertionError: In ``long_context`` mode, when the configured ``max_seq_len`` is not
+                larger than the prompt length plus the token budget.
+            Exception: Any error raised on the generation thread is re-raised to the consumer
+                instead of silently ending the stream.
+        """
+        paired = messages + [{"role": "assistant", "content": ""}]
+        prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired, system, tools)
+        prompt_len = len(prompt_ids)
+
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+
+        # Token budget: engine defaults first, then per-request overrides (`max_new_tokens` wins).
+        if "max_new_tokens" in self.generating_args:
+            max_tokens = int(self.generating_args["max_new_tokens"])
+        elif "max_length" in self.generating_args:
+            gl = int(self.generating_args["max_length"])
+            max_tokens = gl - prompt_len if gl > prompt_len else 1
+        else:
+            max_tokens = self.max_new_tokens or 256
+
+        if max_length is not None:
+            max_tokens = max(max_length - prompt_len, 1)
+        if max_new_tokens is not None:
+            max_tokens = int(max_new_tokens)
+        max_tokens = max(1, int(max_tokens))
+
+        if self.mode == "long_context":
+            max_len_cfg = Config().long_context_config["max_seq_len"]
+            need = prompt_len + max_tokens
+            assert max_len_cfg > need, f"please set max_seq_len > {need} in ~/.ktransformers/config.yaml"
+
+        device = next(self.model.parameters()).device
+        input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
+        if self.force_think:
+            think = torch.tensor(
+                [self.tokenizer.encode("<think>\n", add_special_tokens=False)], dtype=torch.long, device=device
+            )
+            input_tensor = torch.cat([input_tensor, think], dim=1)
+
+        # `architectures` may be missing or explicitly None on some configs; treat both as unknown.
+        architectures = getattr(self.model.config, "architectures", None) or [""]
+        use_flashinfer = (
+            platform.system() != "Windows"
+            and architectures[0] in {"DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"}
+            and flashinfer_enabled
+            and get_compute_capability() >= 8
+            and device_manager.gpu_vendor == GPUVendor.NVIDIA
+        )
+
+        def make_gen():
+            # The flashinfer path only adds MLA-specific keyword arguments to the common call.
+            mla_kwargs = {}
+            if use_flashinfer:
+                config = self.model.config
+                mla_kwargs = dict(
+                    use_flashinfer_mla=True,
+                    num_heads=config.num_attention_heads,
+                    head_dim_ckv=getattr(config, "kv_lora_rank", 0),
+                    head_dim_kpe=getattr(config, "qk_rope_head_dim", 0),
+                    q_head_dim=getattr(config, "qk_rope_head_dim", 0) + getattr(config, "qk_nope_head_dim", 0),
+                )
+
+            return prefill_and_generate_capture(
+                self.model,
+                self.tokenizer,
+                input_tensor,
+                max_tokens,
+                self.use_cuda_graph,
+                mode=self.mode,
+                force_think=self.force_think,
+                chunk_size=self.chunk_size,
+                echo_stream=False,
+                **mla_kwargs,
+            )
+
+        loop = asyncio.get_running_loop()
+        # Carries text chunks, at most one exception from the producer, and a final None sentinel.
+        q: asyncio.Queue = asyncio.Queue()
+
+        def emit(item) -> None:
+            # The queue is not thread-safe, so items are scheduled onto the consumer's loop.
+            loop.call_soon_threadsafe(q.put_nowait, item)
+
+        def producer():
+            try:
+                gen = make_gen()
+                if hasattr(gen, "__aiter__"):
+
+                    async def drain_async():
+                        async for t in gen:
+                            emit(t if isinstance(t, str) else str(t))
+
+                    asyncio.run(drain_async())
+                elif hasattr(gen, "__iter__"):
+                    for t in gen:
+                        emit(t if isinstance(t, str) else str(t))
+                else:
+                    emit(gen if isinstance(gen, str) else str(gen))
+            except Exception as exc:
+                # Forward the failure; otherwise the consumer would only see a truncated stream.
+                emit(exc)
+            finally:
+                emit(None)  # always unblock the consumer
+
+        Thread(target=producer, daemon=True).start()
+
+        while True:
+            item = await q.get()
+            if item is None:
+                break
+            if isinstance(item, Exception):  # text chunks are always `str`, so this is a producer error
+                raise item
+            yield item
+
+    @override
+    async def chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> list["Response"]:
+        r"""Collect the whole generation for a conversation into a single ``Response``.
+
+        Every chunk yielded by ``_generate`` is concatenated in order. ``images``, ``videos`` and
+        ``audios`` are accepted for interface compatibility but are not used. ``finish_reason`` is
+        always reported as ``"stop"``.
+
+        Raises:
+            ValueError: If the loaded model cannot generate text.
+        """
+        if not self.can_generate:
+            raise ValueError("The current model does not support `chat`.")
+        async with self.semaphore:
+            chunks = [chunk async for chunk in self._generate(messages, system, tools, **input_kwargs)]
+            final_text = "".join(chunks)
+
+            # Re-encode the prompt only to report its length.
+            paired = messages + [{"role": "assistant", "content": ""}]
+            prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired, system, tools)
+            completion_ids = self.tokenizer.encode(final_text, add_special_tokens=False)
+            return [
+                Response(
+                    response_text=final_text,
+                    response_length=len(completion_ids),
+                    prompt_length=len(prompt_ids),
+                    finish_reason="stop",
+                )
+            ]
+
+    @override
+    async def stream_chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        r"""Yield the generated text chunk by chunk while holding the engine semaphore.
+
+        ``images``, ``videos`` and ``audios`` are accepted for interface compatibility but are not
+        used. Empty chunks are skipped.
+
+        Raises:
+            ValueError: If the loaded model cannot generate text.
+        """
+        if not self.can_generate:
+            raise ValueError("The current model does not support `stream_chat`.")
+        async with self.semaphore:
+            # `chat` concatenates the chunks from `_generate`, i.e. treats each chunk as a delta.
+            # Forward them unchanged here as well: stripping the previous chunk as a "prefix"
+            # dropped repeated chunks (e.g. "0" followed by "0") and truncated chunks that merely
+            # began with the previous one (e.g. "\n" followed by "\n\n").
+            async for chunk in self._generate(messages, system, tools, **input_kwargs):
+                if chunk:
+                    yield chunk
+
+    @override
+    async def get_scores(
+        self,
+        batch_input: list[str],
+        **input_kwargs,
+    ) -> list[float]:
+        r"""Score ``batch_input`` by running ``_get_scores`` in a worker thread under the semaphore.
+
+        Raises:
+            ValueError: If the loaded model is a generative (auto-regressive) one.
+        """
+        if self.can_generate:
+            raise ValueError("Cannot get scores using an auto-regressive model.")
+        async with self.semaphore:
+            return await asyncio.to_thread(self._get_scores, self.model, self.tokenizer, batch_input, input_kwargs)
diff --git a/LlamaFactory/src/llamafactory/chat/sglang_engine.py b/LlamaFactory/src/llamafactory/chat/sglang_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1d2ead33823bc70d51cda59750d25580f972083
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/chat/sglang_engine.py
@@ -0,0 +1,289 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import atexit
+import json
+from collections.abc import AsyncGenerator, AsyncIterator, Sequence
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import requests
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER, EngineName
+from ..extras.misc import get_device_count, torch_gc
+from ..extras.packages import is_sglang_available
+from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+from ..model import load_config, load_tokenizer
+from ..model.model_utils.quantization import QuantizationMethod
+from .base_engine import BaseEngine, Response
+
+
+if is_sglang_available():
+    from sglang.utils import launch_server_cmd, terminate_process, wait_for_server  # type: ignore
+
+
+if TYPE_CHECKING:
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class SGLangEngine(BaseEngine):
+    """Inference engine for SGLang models.
+
+    This class wraps the SGLang engine to provide a consistent interface for text generation
+    that matches LLaMA Factory's requirements. It uses the SGLang HTTP server approach for
+    better interaction and performance. The engine launches a server process and communicates
+    with it via HTTP requests.
+
+    For more details on the SGLang HTTP server approach, see:
+    https://docs.sglang.ai/backend/send_request.html
+    """
+
+    def __init__(
+        self,
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+    ) -> None:
+        r"""Load tokenizer and template, then launch an SGLang HTTP server and wait until it is ready.
+
+        Raises:
+            RuntimeError: If the server cannot be launched or does not become ready; any started
+                process is cleaned up first and the original error is attached as the cause.
+        """
+        self.name = EngineName.SGLANG
+        self.model_args = model_args
+        config = load_config(model_args)  # may download model from ms hub
+        if getattr(config, "quantization_config", None):  # gptq models should use float16
+            quantization_config: dict[str, Any] = getattr(config, "quantization_config", None)
+            quant_method = quantization_config.get("quant_method", "")
+            if quant_method == QuantizationMethod.GPTQ and model_args.infer_dtype == "auto":
+                model_args.infer_dtype = "float16"
+
+        self.can_generate = finetuning_args.stage == "sft"
+        tokenizer_module = load_tokenizer(model_args)
+        self.tokenizer = tokenizer_module["tokenizer"]
+        self.processor = tokenizer_module["processor"]
+        self.tokenizer.padding_side = "left"
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+        self.template.mm_plugin.expand_mm_tokens = False  # for sglang generate
+        self.generating_args = generating_args.to_dict()
+        self.lora_request = model_args.adapter_name_or_path is not None
+
+        launch_cmd = [
+            "python3 -m sglang.launch_server",
+            f"--model-path {model_args.model_name_or_path}",
+            f"--dtype {model_args.infer_dtype}",
+            f"--context-length {model_args.sglang_maxlen}",
+            f"--mem-fraction-static {model_args.sglang_mem_fraction}",
+            f"--tp-size {model_args.sglang_tp_size if model_args.sglang_tp_size != -1 else get_device_count() or 1}",
+            "--log-level error",
+        ]
+        # Only forward a real cache directory; formatting an unset value would pass the literal
+        # string "None" as the download directory.
+        if model_args.cache_dir is not None:
+            launch_cmd.append(f"--download-dir {model_args.cache_dir}")
+
+        if self.lora_request:
+            launch_cmd.extend(
+                [
+                    "--max-loras-per-batch 1",
+                    f"--lora-backend {model_args.sglang_lora_backend}",
+                    f"--lora-paths lora0={model_args.adapter_name_or_path[0]}",
+                    "--disable-radix-cache",
+                ]
+            )
+        launch_cmd = " ".join(launch_cmd)
+        logger.info_rank0(f"Starting SGLang server with command: {launch_cmd}")
+        try:
+            torch_gc()
+            self.server_process, port = launch_server_cmd(launch_cmd)
+            self.base_url = f"http://localhost:{port}"
+            atexit.register(self._cleanup_server)
+
+            logger.info_rank0(f"Waiting for SGLang server to be ready at {self.base_url}")
+            wait_for_server(self.base_url, timeout=300)
+            logger.info_rank0(f"SGLang server initialized successfully at {self.base_url}")
+            try:
+                # Best-effort diagnostics only: a failure here must not abort start-up.
+                response = requests.get(f"{self.base_url}/get_model_info", timeout=5)
+                if response.status_code == 200:
+                    model_info = response.json()
+                    logger.info(f"SGLang server model info: {model_info}")
+            except Exception as e:
+                logger.debug(f"Note: could not get model info: {str(e)}")
+
+        except Exception as e:
+            logger.error(f"Failed to start SGLang server: {str(e)}")
+            self._cleanup_server()  # make sure to clean up any started process
+            raise RuntimeError(f"SGLang server initialization failed: {str(e)}.") from e
+
+    def _cleanup_server(self):
+        r"""Terminate the SGLang server process if one is running.
+
+        This is invoked from the ``__init__`` failure path, from the ``atexit`` hook and from
+        ``__del__``, so it must be safe to call repeatedly: after a successful termination the
+        process handle is cleared and later calls do nothing. Errors are logged, never raised.
+        """
+        process = getattr(self, "server_process", None)
+        if not process:
+            return
+
+        try:
+            logger.info("Terminating SGLang server process")
+            terminate_process(process)
+            logger.info("SGLang server process terminated")
+        except Exception as e:
+            logger.warning(f"Error terminating SGLang server: {str(e)}")
+        else:
+            # Forget the handle only on success so that a failed attempt can still be retried.
+            self.server_process = None
+
+    async def _generate(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncIterator[dict[str, Any]]:
+        r"""Build the prompt and sampling parameters and return a streaming request generator.
+
+        When multimodal inputs are given but no message contains the matching placeholder, the
+        placeholders are prepended to the first message (the caller's message dict is modified
+        in place).
+
+        Returns:
+            A plain (synchronous) generator, despite the annotation. The HTTP request to
+            ``/generate`` is sent only when the generator is first advanced, and each item is one
+            JSON-decoded ``data:`` line of the server's stream.
+
+        Raises:
+            NotImplementedError: If ``num_return_sequences`` is not 1.
+            RuntimeError: Raised while iterating, when the server answers with a non-200 status.
+        """
+        if images is not None and not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
+            messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
+
+        if videos is not None and not any(VIDEO_PLACEHOLDER in message["content"] for message in messages):
+            messages[0]["content"] = VIDEO_PLACEHOLDER * len(videos) + messages[0]["content"]
+
+        if audios is not None and not any(AUDIO_PLACEHOLDER in message["content"] for message in messages):
+            messages[0]["content"] = AUDIO_PLACEHOLDER * len(audios) + messages[0]["content"]
+
+        messages = self.template.mm_plugin.process_messages(
+            messages, images or [], videos or [], audios or [], self.processor
+        )
+        paired_messages = messages + [{"role": "assistant", "content": ""}]
+        prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired_messages, system, tools)
+        prompt_length = len(prompt_ids)
+
+        temperature: Optional[float] = input_kwargs.pop("temperature", None)
+        top_p: Optional[float] = input_kwargs.pop("top_p", None)
+        top_k: Optional[float] = input_kwargs.pop("top_k", None)
+        num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+        repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+        skip_special_tokens: Optional[bool] = input_kwargs.pop("skip_special_tokens", None)
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+        stop: Optional[Union[str, list[str]]] = input_kwargs.pop("stop", None)
+
+        if num_return_sequences != 1:
+            raise NotImplementedError("SGLang only supports n=1.")
+
+        # Token budget: engine defaults first, then per-request overrides (`max_new_tokens` wins).
+        if "max_new_tokens" in self.generating_args:
+            max_tokens = self.generating_args["max_new_tokens"]
+        elif "max_length" in self.generating_args:
+            if self.generating_args["max_length"] > prompt_length:
+                max_tokens = self.generating_args["max_length"] - prompt_length
+            else:
+                max_tokens = 1
+
+        if max_length:
+            max_tokens = max_length - prompt_length if max_length > prompt_length else 1
+
+        if max_new_tokens:
+            max_tokens = max_new_tokens
+
+        sampling_params = {
+            "temperature": temperature if temperature is not None else self.generating_args["temperature"],
+            "top_p": (top_p if top_p is not None else self.generating_args["top_p"]) or 1.0,  # top_p must > 0
+            "top_k": (top_k if top_k is not None else self.generating_args["top_k"]) or -1,  # top_k must > 0
+            "stop": stop,
+            "stop_token_ids": self.template.get_stop_token_ids(self.tokenizer),
+            "max_new_tokens": max_tokens,
+            "repetition_penalty": (
+                repetition_penalty if repetition_penalty is not None else self.generating_args["repetition_penalty"]
+            )
+            or 1.0,  # repetition_penalty must > 0
+            "skip_special_tokens": skip_special_tokens
+            if skip_special_tokens is not None
+            else self.generating_args["skip_special_tokens"],
+        }
+
+        def stream_request():
+            json_data = {
+                "input_ids": prompt_ids,
+                "sampling_params": sampling_params,
+                "stream": True,
+            }
+            if self.lora_request:
+                json_data["lora_request"] = ["lora0"]
+
+            # The context manager releases the streamed connection on every exit path: normal
+            # completion, the `[DONE]` break, an error, or the consumer closing the generator early.
+            with requests.post(f"{self.base_url}/generate", json=json_data, stream=True) as response:
+                if response.status_code != 200:
+                    raise RuntimeError(f"SGLang server error: {response.status_code}, {response.text}")
+
+                for raw_line in response.iter_lines(decode_unicode=False):
+                    chunk = raw_line.decode("utf-8")
+                    if chunk == "data: [DONE]":
+                        break
+
+                    if chunk and chunk.startswith("data:"):
+                        yield json.loads(chunk[5:].strip("\n"))
+
+        # Calling a generator function only creates the generator; no request is sent yet.
+        return await asyncio.to_thread(stream_request)
+
+    @override
+    async def chat(
+        self,
+        messages: Sequence[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[Sequence["ImageInput"]] = None,
+        videos: Optional[Sequence["VideoInput"]] = None,
+        audios: Optional[Sequence["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> list["Response"]:
+        r"""Run one generation to completion and return it as a single ``Response``.
+
+        The server stream is consumed in a worker thread so that the blocking HTTP reads do not
+        stall the event loop; only the last streamed payload is used for the result.
+
+        Raises:
+            RuntimeError: If the server stream ends without producing any payload.
+        """
+        generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+
+        def drain() -> Optional[dict[str, Any]]:
+            # Blocking iteration over the HTTP stream; keep only the final payload.
+            last_output = None
+            for request_output in generator:
+                last_output = request_output
+
+            return last_output
+
+        final_output = await asyncio.to_thread(drain)
+        if final_output is None:
+            raise RuntimeError("SGLang server returned no output.")
+
+        meta_info = final_output["meta_info"]
+        results = [
+            Response(
+                response_text=final_output["text"],
+                response_length=meta_info["completion_tokens"],
+                prompt_length=meta_info["prompt_tokens"],
+                finish_reason="stop" if meta_info["finish_reason"] == "stop" else "length",
+            )
+        ]
+        return results
+
+    @override
+    async def stream_chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        r"""Yield newly generated text as the server streams it.
+
+        Each streamed payload's ``"text"`` is treated as the text generated so far, so only the
+        part beyond the previously seen length is yielded. Payloads are fetched one at a time in
+        a worker thread so the blocking HTTP reads do not stall the event loop.
+        """
+        generated_text = ""
+        generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+        # `next` with a default avoids raising StopIteration across the thread boundary.
+        end_of_stream = object()
+        while True:
+            result = await asyncio.to_thread(next, generator, end_of_stream)
+            if result is end_of_stream:
+                break
+
+            delta_text = result["text"][len(generated_text) :]
+            generated_text = result["text"]
+            yield delta_text
+
+    @override
+    async def get_scores(
+        self,
+        batch_input: list[str],
+        **input_kwargs,
+    ) -> list[float]:
+        r"""Scoring is unavailable with this engine; calling it always fails.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("SGLang engine does not support `get_scores`.")
+
+    def __del__(self):
+        r"""Stop the server when the engine is deleted and drop the matching ``atexit`` hook."""
+        cleanup_hook = self._cleanup_server
+        cleanup_hook()
+        try:
+            atexit.unregister(cleanup_hook)
+        except Exception:
+            # Best effort: unregistering must never raise out of a finalizer.
+            pass
diff --git a/LlamaFactory/src/llamafactory/chat/vllm_engine.py b/LlamaFactory/src/llamafactory/chat/vllm_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..075924a2fdb6c12c942b5704bc5ffd49d92808a4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/chat/vllm_engine.py
@@ -0,0 +1,271 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import uuid
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from packaging import version
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER, EngineName
+from ..extras.misc import get_device_count
+from ..extras.packages import is_vllm_available
+from ..model import load_config, load_tokenizer
+from ..model.model_utils.quantization import QuantizationMethod
+from ..model.model_utils.visual import LlavaMultiModalProjectorForYiVLForVLLM
+from .base_engine import BaseEngine, Response
+
+
+if is_vllm_available():
+    from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams
+    from vllm.lora.request import LoRARequest
+
+
+if TYPE_CHECKING:
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+    from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class VllmEngine(BaseEngine):
+    r"""Chat engine that serves generation requests through vLLM's `AsyncLLMEngine`."""
+
+    def __init__(
+        self,
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+    ) -> None:
+        r"""Load the tokenizer and template, then build the vLLM async engine.
+
+        Args:
+            model_args: Model options. `infer_dtype` is overwritten with "float16" for GPTQ models left on "auto".
+            data_args: Data options used to resolve the chat template.
+            finetuning_args: Only `stage` is read, to decide whether the engine can generate.
+            generating_args: Default sampling options, stored as a dict in `self.generating_args`.
+        """
+        self.name = EngineName.VLLM
+        self.model_args = model_args
+        config = load_config(model_args)  # may download model from ms hub
+        quantization_config: Optional[dict[str, Any]] = getattr(config, "quantization_config", None)
+        if quantization_config:  # gptq models should use float16
+            quant_method = quantization_config.get("quant_method", "")
+            if quant_method == QuantizationMethod.GPTQ and model_args.infer_dtype == "auto":
+                model_args.infer_dtype = "float16"
+
+        self.can_generate = finetuning_args.stage == "sft"
+        tokenizer_module = load_tokenizer(model_args)
+        self.tokenizer = tokenizer_module["tokenizer"]
+        self.processor = tokenizer_module["processor"]
+        self.tokenizer.padding_side = "left"
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+        self.template.mm_plugin.expand_mm_tokens = False  # for vllm generate
+        self.generating_args = generating_args.to_dict()
+
+        engine_args = {
+            "model": model_args.model_name_or_path,
+            "trust_remote_code": model_args.trust_remote_code,
+            "download_dir": model_args.cache_dir,
+            "dtype": model_args.infer_dtype,
+            "max_model_len": model_args.vllm_maxlen,
+            "tensor_parallel_size": get_device_count() or 1,
+            "gpu_memory_utilization": model_args.vllm_gpu_util,
+            "disable_log_stats": True,
+            "enforce_eager": model_args.vllm_enforce_eager,
+            "enable_lora": model_args.adapter_name_or_path is not None,
+            "max_lora_rank": model_args.vllm_max_lora_rank,
+        }
+
+        import vllm
+
+        # The request-logging switch is spelled differently depending on the installed vLLM version.
+        if version.parse(vllm.__version__) <= version.parse("0.10.0"):
+            engine_args["disable_log_requests"] = True
+        else:
+            engine_args["enable_log_requests"] = False
+
+        if self.template.mm_plugin.__class__.__name__ != "BasePlugin":
+            engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
+
+        # User-supplied vLLM options override the defaults assembled above.
+        if isinstance(model_args.vllm_config, dict):
+            engine_args.update(model_args.vllm_config)
+
+        if getattr(config, "is_yi_vl_derived_model", None):
+            import vllm.model_executor.models.llava
+
+            logger.info_rank0("Detected Yi-VL model, applying projector patch.")
+            vllm.model_executor.models.llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVLForVLLM
+
+        self.model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
+        if model_args.adapter_name_or_path is not None:
+            self.lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
+        else:
+            self.lora_request = None
+
+    def _build_multi_modal_data(
+        self,
+        images: Optional[list["ImageInput"]],
+        videos: Optional[list["VideoInput"]],
+        audios: Optional[list["AudioInput"]],
+    ) -> Optional[dict[str, Any]]:
+        r"""Regularize every supplied modality into one `multi_modal_data` mapping.
+
+        Each modality that is not None gets its own key, so images, videos and audios can be sent together.
+        Returns None when no modality is supplied.
+        """
+        mm_plugin = self.template.mm_plugin
+        multi_modal_data: dict[str, Any] = {}
+        if images is not None:  # add image features
+            multi_modal_data["image"] = mm_plugin._regularize_images(
+                images,
+                image_max_pixels=self.model_args.image_max_pixels,
+                image_min_pixels=self.model_args.image_min_pixels,
+            )["images"]
+
+        if videos is not None:
+            multi_modal_data["video"] = mm_plugin._regularize_videos(
+                videos,
+                image_max_pixels=self.model_args.video_max_pixels,
+                image_min_pixels=self.model_args.video_min_pixels,
+                video_fps=self.model_args.video_fps,
+                video_maxlen=self.model_args.video_maxlen,
+            )["videos"]
+
+        if audios is not None:
+            audio_data = mm_plugin._regularize_audios(
+                audios,
+                sampling_rate=self.model_args.audio_sampling_rate,
+            )
+            # Materialize the pairs: a bare zip iterator can only be consumed once.
+            multi_modal_data["audio"] = list(zip(audio_data["audios"], audio_data["sampling_rates"]))
+
+        return multi_modal_data or None
+
+    async def _generate(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncIterator["RequestOutput"]:
+        r"""Encode the conversation and submit it to the vLLM engine.
+
+        Sampling options found in `input_kwargs` take precedence over the defaults in `self.generating_args`.
+        When a modality is supplied but no message contains its placeholder, the placeholders are prepended
+        to `messages[0]["content"]` in place.
+
+        Returns:
+            The result iterator produced by `self.model.generate`.
+        """
+        request_id = f"chatcmpl-{uuid.uuid4().hex}"
+        if images is not None and not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
+            messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
+
+        if videos is not None and not any(VIDEO_PLACEHOLDER in message["content"] for message in messages):
+            messages[0]["content"] = VIDEO_PLACEHOLDER * len(videos) + messages[0]["content"]
+
+        if audios is not None and not any(AUDIO_PLACEHOLDER in message["content"] for message in messages):
+            messages[0]["content"] = AUDIO_PLACEHOLDER * len(audios) + messages[0]["content"]
+
+        messages = self.template.mm_plugin.process_messages(
+            messages, images or [], videos or [], audios or [], self.processor
+        )
+        # An empty assistant turn is appended so that the encoded prompt ends where the reply should begin.
+        paired_messages = messages + [{"role": "assistant", "content": ""}]
+        prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired_messages, system, tools)
+        prompt_length = len(prompt_ids)
+
+        temperature: Optional[float] = input_kwargs.pop("temperature", None)
+        top_p: Optional[float] = input_kwargs.pop("top_p", None)
+        top_k: Optional[int] = input_kwargs.pop("top_k", None)
+        num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+        repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+        length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None)
+        skip_special_tokens: Optional[bool] = input_kwargs.pop("skip_special_tokens", None)
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+        stop: Optional[Union[str, list[str]]] = input_kwargs.pop("stop", None)
+
+        if length_penalty is not None:
+            logger.warning_rank0("Length penalty is not supported by the vllm engine yet.")
+
+        # Defaults first; the per-call `max_length` and then `max_new_tokens` override them below.
+        if "max_new_tokens" in self.generating_args:
+            max_tokens = self.generating_args["max_new_tokens"]
+        elif "max_length" in self.generating_args:
+            if self.generating_args["max_length"] > prompt_length:
+                max_tokens = self.generating_args["max_length"] - prompt_length
+            else:
+                max_tokens = 1
+
+        if max_length:
+            max_tokens = max_length - prompt_length if max_length > prompt_length else 1
+
+        if max_new_tokens:
+            max_tokens = max_new_tokens
+
+        sampling_params = SamplingParams(
+            n=num_return_sequences,
+            repetition_penalty=(
+                repetition_penalty if repetition_penalty is not None else self.generating_args["repetition_penalty"]
+            )
+            or 1.0,  # repetition_penalty must > 0
+            temperature=temperature if temperature is not None else self.generating_args["temperature"],
+            top_p=(top_p if top_p is not None else self.generating_args["top_p"]) or 1.0,  # top_p must > 0
+            top_k=(top_k if top_k is not None else self.generating_args["top_k"]) or -1,  # top_k must > 0
+            stop=stop,
+            stop_token_ids=self.template.get_stop_token_ids(self.tokenizer),
+            max_tokens=max_tokens,
+            skip_special_tokens=skip_special_tokens
+            if skip_special_tokens is not None
+            else self.generating_args["skip_special_tokens"],
+        )
+
+        multi_modal_data = self._build_multi_modal_data(images, videos, audios)
+        result_generator = self.model.generate(
+            {"prompt_token_ids": prompt_ids, "multi_modal_data": multi_modal_data},
+            sampling_params=sampling_params,
+            request_id=request_id,
+            lora_request=self.lora_request,
+        )
+        return result_generator
+
+    @override
+    async def chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> list["Response"]:
+        r"""Run generation to completion and return one `Response` per sequence of the last update."""
+        final_output = None
+        generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+        async for request_output in generator:  # only the last update is kept
+            final_output = request_output
+
+        results = []
+        for output in final_output.outputs:
+            results.append(
+                Response(
+                    response_text=output.text,
+                    response_length=len(output.token_ids),
+                    prompt_length=len(final_output.prompt_token_ids),
+                    finish_reason=output.finish_reason,
+                )
+            )
+
+        return results
+
+    @override
+    async def stream_chat(
+        self,
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+        images: Optional[list["ImageInput"]] = None,
+        videos: Optional[list["VideoInput"]] = None,
+        audios: Optional[list["AudioInput"]] = None,
+        **input_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        r"""Stream the first sequence, yielding only the text added since the previous update."""
+        generated_text = ""
+        generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+        async for result in generator:
+            delta_text = result.outputs[0].text[len(generated_text) :]
+            generated_text = result.outputs[0].text
+            yield delta_text
+
+    @override
+    async def get_scores(
+        self,
+        batch_input: list[str],
+        **input_kwargs,
+    ) -> list[float]:
+        r"""Scoring is unavailable for this engine; always raises `NotImplementedError`."""
+        raise NotImplementedError("vLLM engine does not support `get_scores`.")
diff --git a/LlamaFactory/src/llamafactory/cli.py b/LlamaFactory/src/llamafactory/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..d574bf1db543f5379f074e276898826234708037
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/cli.py
@@ -0,0 +1,31 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def main():
+    r"""Select the launcher implementation and run it.
+
+    The `.v1` launcher is used when `is_env_enabled("USE_V1")` is true, the package-level launcher otherwise.
+    """
+    from .extras.misc import is_env_enabled
+
+    use_v1 = is_env_enabled("USE_V1")
+    if not use_v1:
+        from . import launcher
+    else:
+        from .v1 import launcher
+
+    launcher.launch()
+
+
+if __name__ == "__main__":
+    import multiprocessing
+
+    # Called before `main()` when the module is executed as a script.
+    multiprocessing.freeze_support()
+    main()
diff --git a/LlamaFactory/src/llamafactory/data/.ipynb_checkpoints/template-checkpoint.py b/LlamaFactory/src/llamafactory/data/.ipynb_checkpoints/template-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e83b2e90a60c439370ed5cad4a13846abc977bf5
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/.ipynb_checkpoints/template-checkpoint.py
@@ -0,0 +1,2175 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+from typing_extensions import override
+
+from ..extras import logging
+from .data_utils import Role
+from .formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter
+from .mm_plugin import get_mm_plugin
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
+    from ..hparams import DataArguments
+    from .formatter import SLOTS, Formatter
+    from .mm_plugin import BasePlugin
+    from .tool_utils import FunctionCall
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class Template:
+    r"""Chat template that turns messages into token ids through per-role formatters.
+
+    It also fixes the tokenizer's special tokens and can export an equivalent Jinja chat template
+    or an Ollama modelfile.
+    """
+
+    # One formatter per message role or prompt section; `apply` returns the slots to encode.
+    format_user: "Formatter"
+    format_assistant: "Formatter"
+    format_system: "Formatter"
+    format_function: "Formatter"
+    format_observation: "Formatter"
+    format_tools: "Formatter"
+    format_prefix: "Formatter"
+    # System prompt used when the caller passes none.
+    default_system: str
+    # Extra stop tokens; with `replace_eos` the first one replaces the tokenizer's EOS token.
+    stop_words: list[str]
+    # Opening and closing markers of the thought section.
+    thought_words: tuple[str, str]
+    # Forwarded to the function formatter together with `thought_words`.
+    tool_call_words: tuple[str, str]
+    efficient_eos: bool
+    replace_eos: bool
+    # When True, `fix_jinja_template` overwrites an existing tokenizer chat template.
+    replace_jinja_template: bool
+    enable_thinking: Optional[bool]
+    mm_plugin: "BasePlugin"
+
+    def encode_oneturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> tuple[list[int], list[int]]:
+        r"""Return a single pair of token ids representing prompt and response respectively.
+
+        Every message except the last is concatenated into the prompt; the last one is the response.
+        """
+        encoded_messages = self._encode(tokenizer, messages, system, tools)
+        prompt_ids = []
+        for encoded_ids in encoded_messages[:-1]:
+            prompt_ids += encoded_ids
+
+        response_ids = encoded_messages[-1]
+        return prompt_ids, response_ids
+
+    def encode_multiturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> list[tuple[list[int], list[int]]]:
+        r"""Return multiple pairs of token ids representing prompts and responses respectively.
+
+        Messages are paired two by two, so an even number of messages is expected.
+        """
+        encoded_messages = self._encode(tokenizer, messages, system, tools)
+        return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)]
+
+    def extract_tool(self, content: str) -> Union[str, list["FunctionCall"]]:
+        r"""Extract tool message."""
+        return self.format_tools.extract(content)
+
+    def get_stop_token_ids(self, tokenizer: "PreTrainedTokenizer") -> list[int]:
+        r"""Return stop token ids: the EOS token id plus the id of every stop word, without duplicates."""
+        stop_token_ids = {tokenizer.eos_token_id}
+        for token in self.stop_words:
+            stop_token_ids.add(tokenizer.convert_tokens_to_ids(token))
+
+        return list(stop_token_ids)
+
+    def add_thought(self, content: str = "") -> str:
+        r"""Add empty thought to assistant message."""
+        return f"{self.thought_words[0]}{self.thought_words[1]}" + content
+
+    def remove_thought(self, content: str) -> str:
+        r"""Remove thought from assistant message, then strip the leading newlines left behind."""
+        pattern = re.compile(f"{re.escape(self.thought_words[0])}(.*?){re.escape(self.thought_words[1])}", re.DOTALL)
+        return re.sub(pattern, "", content).lstrip("\n")
+
+    def get_thought_word_ids(self, tokenizer: "PreTrainedTokenizer") -> list[int]:
+        r"""Get the token ids of thought words."""
+        return tokenizer.encode(self.add_thought(), add_special_tokens=False)
+
+    def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", elements: "SLOTS") -> list[int]:
+        r"""Convert elements to token ids.
+
+        Strings are tokenized without special tokens, dicts name a single token under the "token" key,
+        and sets request the tokenizer's BOS or EOS token when it has one.
+
+        Raises:
+            ValueError: If an element is not a string, a set or a dict.
+        """
+        token_ids = []
+        for elem in elements:
+            if isinstance(elem, str):
+                if len(elem) != 0:
+                    token_ids += tokenizer.encode(elem, add_special_tokens=False)
+            elif isinstance(elem, dict):
+                token_ids += [tokenizer.convert_tokens_to_ids(elem.get("token"))]
+            elif isinstance(elem, set):
+                if "bos_token" in elem and tokenizer.bos_token_id is not None:
+                    token_ids += [tokenizer.bos_token_id]
+                elif "eos_token" in elem and tokenizer.eos_token_id is not None:
+                    token_ids += [tokenizer.eos_token_id]
+            else:
+                raise ValueError(f"Input must be string, set[str] or dict[str, str], got {type(elem)}")
+
+        return token_ids
+
+    def _encode(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str],
+        tools: Optional[str],
+    ) -> list[list[int]]:
+        r"""Encode formatted inputs to pairs of token ids.
+
+        Turn 0: prefix + system + query        resp
+        Turn t: query                          resp.
+
+        Raises:
+            NotImplementedError: If a message has an unexpected role.
+        """
+        system = system or self.default_system
+        encoded_messages = []
+        for i, message in enumerate(messages):
+            elements = []
+
+            if i == 0:
+                elements += self.format_prefix.apply()
+                if system or tools:
+                    tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
+                    elements += self.format_system.apply(content=(system + tool_text))
+
+            if message["role"] == Role.USER:
+                elements += self.format_user.apply(content=message["content"], idx=str(i // 2))
+            elif message["role"] == Role.ASSISTANT:
+                elements += self.format_assistant.apply(content=message["content"])
+            elif message["role"] == Role.OBSERVATION:
+                elements += self.format_observation.apply(content=message["content"])
+            elif message["role"] == Role.FUNCTION:
+                elements += self.format_function.apply(
+                    content=message["content"], thought_words=self.thought_words, tool_call_words=self.tool_call_words
+                )
+            else:
+                raise NotImplementedError("Unexpected role: {}".format(message["role"]))
+
+            encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements))
+
+        return encoded_messages
+
+    @staticmethod
+    def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
+        r"""Add or replace eos token to the tokenizer; does nothing when it is already the EOS token."""
+        if tokenizer.eos_token == eos_token:
+            return
+
+        is_added = tokenizer.eos_token_id is None
+        num_added_tokens = tokenizer.add_special_tokens({"eos_token": eos_token})
+
+        if is_added:
+            logger.info_rank0(f"Add eos token: {tokenizer.eos_token}.")
+        else:
+            logger.info_rank0(f"Replace eos token: {tokenizer.eos_token}.")
+
+        if num_added_tokens > 0:
+            logger.warning_rank0("New tokens have been added, make sure `resize_vocab` is True.")
+
+    def fix_special_tokens(self, tokenizer: "PreTrainedTokenizer") -> None:
+        r"""Add eos token and pad token to the tokenizer, and register the stop words as special tokens.
+
+        Raises:
+            ValueError: If `replace_eos` is set but there is no stop word to use as the EOS token.
+        """
+        stop_words = self.stop_words
+        if self.replace_eos:
+            if not stop_words:
+                raise ValueError("Stop words are required to replace the EOS token.")
+
+            self._add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
+            stop_words = stop_words[1:]
+
+        if tokenizer.eos_token_id is None:
+            self._add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
+
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            logger.info_rank0(f"Add pad token: {tokenizer.pad_token}")
+
+        if stop_words:
+            try:
+                num_added_tokens = tokenizer.add_special_tokens(
+                    dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False
+                )
+            except TypeError:
+                # Fall back to the call without the keyword when the tokenizer rejects it.
+                num_added_tokens = tokenizer.add_special_tokens(dict(additional_special_tokens=stop_words))
+            logger.info_rank0("Add {} to stop words.".format(",".join(stop_words)))
+            if num_added_tokens > 0:
+                logger.warning_rank0("New tokens have been added, make sure `resize_vocab` is True.")
+
+    @staticmethod
+    def _jinja_escape(content: str) -> str:
+        r"""Escape backslashes and single quotes so content survives inside a single-quoted Jinja literal."""
+        # Backslashes go first; otherwise the ones introduced for the quotes would be doubled too.
+        return content.replace("\\", "\\\\").replace("'", r"\'")
+
+    @staticmethod
+    def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str:
+        r"""Convert slots to jinja template.
+
+        String slots become quoted literals with `{{content}}` replaced by the `placeholder` variable;
+        set slots become the literal BOS or EOS token. All pieces are joined with " + ".
+
+        Raises:
+            ValueError: If a slot is a dict.
+        """
+        slot_items = []
+        for slot in slots:
+            if isinstance(slot, str):
+                slot_pieces = slot.split("{{content}}")
+                if slot_pieces[0]:
+                    slot_items.append("'" + Template._jinja_escape(slot_pieces[0]) + "'")
+                if len(slot_pieces) > 1:
+                    slot_items.append(placeholder)
+                    if slot_pieces[1]:
+                        slot_items.append("'" + Template._jinja_escape(slot_pieces[1]) + "'")
+            elif isinstance(slot, set):  # do not use {{ eos_token }} since it may be replaced
+                # Token text is escaped like any other literal so that a quote or backslash cannot break it.
+                if "bos_token" in slot and tokenizer.bos_token_id is not None:
+                    slot_items.append("'" + Template._jinja_escape(tokenizer.bos_token) + "'")
+                elif "eos_token" in slot and tokenizer.eos_token_id is not None:
+                    slot_items.append("'" + Template._jinja_escape(tokenizer.eos_token) + "'")
+            elif isinstance(slot, dict):
+                raise ValueError("Dict is not supported.")
+
+        return " + ".join(slot_items)
+
+    def _get_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the jinja template.
+
+        The template emits the prefix, then the system message (a leading "system" message if present,
+        otherwise the default system prompt), then every user and assistant message.
+        """
+        prefix = self._convert_slots_to_jinja(self.format_prefix.apply(), tokenizer)
+        system = self._convert_slots_to_jinja(self.format_system.apply(), tokenizer, placeholder="system_message")
+        user = self._convert_slots_to_jinja(self.format_user.apply(), tokenizer)
+        assistant = self._convert_slots_to_jinja(self.format_assistant.apply(), tokenizer)
+        jinja_template = ""
+        if prefix:
+            jinja_template += "{{ " + prefix + " }}"
+
+        if self.default_system:
+            jinja_template += "{% set system_message = '" + self._jinja_escape(self.default_system) + "' %}"
+
+        jinja_template += (
+            "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}"
+            "{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}"
+            "{% if system_message is defined %}{{ " + system + " }}{% endif %}"
+            "{% for message in loop_messages %}"
+            "{% set content = message['content'] %}"
+            "{% if message['role'] == 'user' %}"
+            "{{ " + user + " }}"
+            "{% elif message['role'] == 'assistant' %}"
+            "{{ " + assistant + " }}"
+            "{% endif %}"
+            "{% endfor %}"
+        )
+        return jinja_template
+
+    def fix_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> None:
+        r"""Replace the jinja template in the tokenizer.
+
+        Applies only when the tokenizer has no chat template or `replace_jinja_template` is set.
+        A template that cannot be converted is logged and left untouched.
+        """
+        if tokenizer.chat_template is None or self.replace_jinja_template:
+            try:
+                tokenizer.chat_template = self._get_jinja_template(tokenizer)
+            except ValueError as e:
+                logger.info_rank0(f"Cannot add this chat template to tokenizer: {e}.")
+
+    @staticmethod
+    def _convert_slots_to_ollama(
+        slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content"
+    ) -> str:
+        r"""Convert slots to ollama template.
+
+        String slots are copied as is with `{{content}}` replaced by `{{ placeholder }}`;
+        set slots become the literal BOS or EOS token.
+
+        Raises:
+            ValueError: If a slot is a dict.
+        """
+        slot_items = []
+        for slot in slots:
+            if isinstance(slot, str):
+                slot_pieces = slot.split("{{content}}")
+                if slot_pieces[0]:
+                    slot_items.append(slot_pieces[0])
+                if len(slot_pieces) > 1:
+                    slot_items.append("{{ " + placeholder + " }}")
+                    if slot_pieces[1]:
+                        slot_items.append(slot_pieces[1])
+            elif isinstance(slot, set):  # do not use {{ eos_token }} since it may be replaced
+                if "bos_token" in slot and tokenizer.bos_token_id is not None:
+                    slot_items.append(tokenizer.bos_token)
+                elif "eos_token" in slot and tokenizer.eos_token_id is not None:
+                    slot_items.append(tokenizer.eos_token)
+            elif isinstance(slot, dict):
+                raise ValueError("Dict is not supported.")
+
+        return "".join(slot_items)
+
+    def _get_ollama_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the ollama template."""
+        prefix = self._convert_slots_to_ollama(self.format_prefix.apply(), tokenizer)
+        system = self._convert_slots_to_ollama(self.format_system.apply(), tokenizer, placeholder=".System")
+        user = self._convert_slots_to_ollama(self.format_user.apply(), tokenizer, placeholder=".Content")
+        assistant = self._convert_slots_to_ollama(self.format_assistant.apply(), tokenizer, placeholder=".Content")
+        return (
+            f"{prefix}{{{{ if .System }}}}{system}{{{{ end }}}}"
+            f"""{{{{ range .Messages }}}}{{{{ if eq .Role "user" }}}}{user}"""
+            f"""{{{{ else if eq .Role "assistant" }}}}{assistant}{{{{ end }}}}{{{{ end }}}}"""
+        )
+
+    def get_ollama_modelfile(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the ollama modelfile.
+
+        The file holds the template, the default system prompt when there is one, a `stop` parameter
+        for every stop token and a fixed `num_ctx` of 4096.
+
+        TODO: support function calling.
+        """
+        modelfile = "# ollama modelfile auto-generated by llamafactory\n\n"
+        modelfile += f'FROM .\n\nTEMPLATE """{self._get_ollama_template(tokenizer)}"""\n\n'
+
+        if self.default_system:
+            modelfile += f'SYSTEM """{self.default_system}"""\n\n'
+
+        for stop_token_id in self.get_stop_token_ids(tokenizer):
+            modelfile += f'PARAMETER stop "{tokenizer.convert_ids_to_tokens(stop_token_id)}"\n'
+
+        modelfile += "PARAMETER num_ctx 4096\n"
+        return modelfile
+
+
+@dataclass
+class Llama2Template(Template):
+    r"""Template variant that folds the system message into the first user message."""
+
+    @override
+    def _encode(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: str,
+        tools: str,
+    ) -> list[list[int]]:
+        r"""Encode every message into token ids.
+
+        On the first message the prefix slots are emitted and the formatted system text (system prompt
+        plus tool description) is computed; that text is prepended to the content of a user message
+        instead of being emitted as separate slots.
+
+        Raises:
+            NotImplementedError: If a message has an unexpected role.
+        """
+        system = system or self.default_system
+        all_token_ids = []
+        for turn_idx, message in enumerate(messages):
+            slots = []
+            system_text = ""
+            if turn_idx == 0:
+                slots.extend(self.format_prefix.apply())
+                if system or tools:
+                    tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
+                    system_text = self.format_system.apply(content=(system + tool_text))[0]
+
+            role = message["role"]
+            if role == Role.USER:
+                role_slots = self.format_user.apply(content=system_text + message["content"])
+            elif role == Role.ASSISTANT:
+                role_slots = self.format_assistant.apply(content=message["content"])
+            elif role == Role.OBSERVATION:
+                role_slots = self.format_observation.apply(content=message["content"])
+            elif role == Role.FUNCTION:
+                role_slots = self.format_function.apply(content=message["content"])
+            else:
+                raise NotImplementedError("Unexpected role: {}".format(role))
+
+            slots.extend(role_slots)
+            all_token_ids.append(self._convert_elements_to_ids(tokenizer, slots))
+
+        return all_token_ids
+
+    def _get_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the jinja template, with the system message merged into the first loop message."""
+        prefix = self._convert_slots_to_jinja(self.format_prefix.apply(), tokenizer)
+        system_message = self._convert_slots_to_jinja(
+            self.format_system.apply(), tokenizer, placeholder="system_message"
+        )
+        user_message = self._convert_slots_to_jinja(self.format_user.apply(), tokenizer)
+        assistant_message = self._convert_slots_to_jinja(self.format_assistant.apply(), tokenizer)
+
+        pieces = []
+        if prefix:
+            pieces.append("{{ " + prefix + " }}")
+
+        if self.default_system:
+            pieces.append("{% set system_message = '" + self._jinja_escape(self.default_system) + "' %}")
+
+        pieces.extend(
+            [
+                "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}",
+                "{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}",
+                "{% endif %}",
+                "{% for message in loop_messages %}",
+                "{% if loop.index0 == 0 and system_message is defined %}",
+                "{% set content = " + system_message + " + message['content'] %}",
+                "{% else %}{% set content = message['content'] %}{% endif %}",
+                "{% if message['role'] == 'user' %}",
+                "{{ " + user_message + " }}",
+                "{% elif message['role'] == 'assistant' %}",
+                "{{ " + assistant_message + " }}",
+                "{% endif %}",
+                "{% endfor %}",
+            ]
+        )
+        return "".join(pieces)
+
+
+@dataclass
+class ReasoningTemplate(Template):
+    r"""A template that add thought to assistant message."""
+
+    @override
+    def encode_oneturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> tuple[list[int], list[int]]:
+        r"""Encode one prompt/response pair while managing the thought section.
+
+        Works on a deep copy of `messages`. Thoughts are removed from the odd-indexed messages that come
+        before the final pair, and from the final message too when `enable_thinking` is False. If the
+        final message contains neither thought word, the empty thought is appended to the prompt
+        (thinking not enabled) or prepended to the response (thinking enabled).
+        """
+        messages = deepcopy(messages)
+        for history_message in messages[1:-2:2]:  # odd indices ahead of the final pair
+            history_message["content"] = self.remove_thought(history_message["content"])
+
+        if self.enable_thinking is False:  # remove all cot
+            messages[-1]["content"] = self.remove_thought(messages[-1]["content"])
+
+        prompt_ids, response_ids = super().encode_oneturn(tokenizer, messages, system, tools)
+        final_content = messages[-1]["content"]
+        opening_word, closing_word = self.thought_words[0], self.thought_words[1]
+        has_thought = opening_word.strip() in final_content or closing_word.strip() in final_content
+        if has_thought:
+            return prompt_ids, response_ids
+
+        # add empty cot
+        thought_ids = self.get_thought_word_ids(tokenizer)
+        if self.enable_thinking:  # do compute loss
+            response_ids = thought_ids + response_ids
+        else:  # do not compute loss
+            prompt_ids += thought_ids
+
+        return prompt_ids, response_ids
+
+    @override
+    def encode_multiturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> list[tuple[list[int], list[int]]]:
+        messages = deepcopy(messages)
+        if self.enable_thinking is False:  # remove all cot
+            for i in range(1, len(messages), 2):
+                messages[i]["content"] = self.remove_thought(messages[i]["content"])
+
+        encoded_messages = self._encode(tokenizer, messages, system, tools)
+        for i in range(0, len(messages), 2):
+            if (
+                self.thought_words[0].strip() not in messages[i + 1]["content"]
+                and self.thought_words[1].strip() not in messages[i + 1]["content"]
+            ):  # add empty cot
+                if not self.enable_thinking:  # do not compute loss
+                    encoded_messages[i] += self.get_thought_word_ids(tokenizer)
+                else:  # do compute loss
+                    encoded_messages[i + 1] = self.get_thought_word_ids(tokenizer) + encoded_messages[i + 1]
+
+        return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)]
+
+
+# Registry of chat templates keyed by template name; filled by `register_template`.
+TEMPLATES: dict[str, "Template"] = {}
+
+
+def register_template(
+    name: str,
+    format_user: Optional["Formatter"] = None,
+    format_assistant: Optional["Formatter"] = None,
+    format_system: Optional["Formatter"] = None,
+    format_function: Optional["Formatter"] = None,
+    format_observation: Optional["Formatter"] = None,
+    format_tools: Optional["Formatter"] = None,
+    format_prefix: Optional["Formatter"] = None,
+    default_system: str = "",
+    stop_words: Optional[list[str]] = None,
+    thought_words: Optional[tuple[str, str]] = None,
+    tool_call_words: Optional[tuple[str, str]] = None,
+    efficient_eos: bool = False,
+    replace_eos: bool = False,
+    replace_jinja_template: bool = False,
+    enable_thinking: Optional[bool] = True,
+    mm_plugin: "BasePlugin" = get_mm_plugin(name="base"),
+    template_class: type["Template"] = Template,
+) -> None:
+    r"""Register a chat template.
+
+    To add the following chat template:
+    ```
+    <s><user>user prompt here
+    <model>model response here</s>
+    <user>user prompt here
+    <model>model response here</s>
+    ```
+
+    The corresponding code should be:
+    ```
+    register_template(
+        name="custom",
+        format_user=StringFormatter(slots=["<user>{{content}}\n<model>"]),
+        format_assistant=StringFormatter(slots=["{{content}}</s>\n"]),
+        format_prefix=EmptyFormatter("<s>"),
+    )
+    ```
+    """
+    if name in TEMPLATES:
+        raise ValueError(f"Template {name} already exists.")
+
+    default_slots = ["{{content}}"] if efficient_eos else ["{{content}}", {"eos_token"}]
+    default_user_formatter = StringFormatter(slots=["{{content}}"])
+    default_assistant_formatter = StringFormatter(slots=default_slots)
+    if format_assistant is not None:
+        default_function_formatter = FunctionFormatter(slots=format_assistant.slots, tool_format="default")
+    else:
+        default_function_formatter = FunctionFormatter(slots=default_slots, tool_format="default")
+
+    default_tool_formatter = ToolFormatter(tool_format="default")
+    default_prefix_formatter = EmptyFormatter()
+    TEMPLATES[name] = template_class(
+        format_user=format_user or default_user_formatter,
+        format_assistant=format_assistant or default_assistant_formatter,
+        format_system=format_system or default_user_formatter,
+        format_function=format_function or default_function_formatter,
+        format_observation=format_observation or format_user or default_user_formatter,
+        format_tools=format_tools or default_tool_formatter,
+        format_prefix=format_prefix or default_prefix_formatter,
+        default_system=default_system,
+        stop_words=stop_words or [],
+        thought_words=thought_words or ("<think>\n", "\n</think>\n\n"),
+        tool_call_words=tool_call_words or ("<tool_call>", "</tool_call>"),
+        efficient_eos=efficient_eos,
+        replace_eos=replace_eos,
+        replace_jinja_template=replace_jinja_template,
+        enable_thinking=enable_thinking,
+        mm_plugin=mm_plugin,
+    )
+
+
+def parse_template(tokenizer: "PreTrainedTokenizer") -> "Template":
+    r"""Extract a chat template from the tokenizer."""
+
+    def find_diff(short_str: str, long_str: str) -> str:
+        i, j = 0, 0
+        diff = ""
+        while i < len(short_str) and j < len(long_str):
+            if short_str[i] == long_str[j]:
+                i += 1
+                j += 1
+            else:
+                diff += long_str[j]
+                j += 1
+
+        return diff
+
+    prefix = tokenizer.decode(tokenizer.encode(""))
+
+    messages = [{"role": "system", "content": "{{content}}"}]
+    system_slot = tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)[len(prefix) :]
+
+    messages = [{"role": "system", "content": ""}, {"role": "user", "content": "{{content}}"}]
+    user_slot_empty_system = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+    user_slot_empty_system = user_slot_empty_system[len(prefix) :]
+
+    messages = [{"role": "user", "content": "{{content}}"}]
+    user_slot = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+    user_slot = user_slot[len(prefix) :]
+
+    messages = [{"role": "user", "content": "{{content}}"}, {"role": "assistant", "content": "{{content}}"}]
+    assistant_slot = tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
+    assistant_slot = assistant_slot[len(prefix) + len(user_slot) :]
+    template_class = ReasoningTemplate if "<think>" in assistant_slot else Template
+    assistant_slot = assistant_slot.replace("<think>", "").replace("</think>", "").lstrip("\n")  # remove thought tags
+
+    if len(user_slot) > len(user_slot_empty_system):
+        default_system = find_diff(user_slot_empty_system, user_slot)
+        sole_system = system_slot.replace("{{content}}", default_system, 1)
+        user_slot = user_slot[len(sole_system) :]
+    else:  # if defaut_system is empty, user_slot_empty_system will be longer than user_slot
+        default_system = ""
+
+    return template_class(
+        format_user=StringFormatter(slots=[user_slot]),
+        format_assistant=StringFormatter(slots=[assistant_slot]),
+        format_system=StringFormatter(slots=[system_slot]),
+        format_function=FunctionFormatter(slots=[assistant_slot], tool_format="default"),
+        format_observation=StringFormatter(slots=[user_slot]),
+        format_tools=ToolFormatter(tool_format="default"),
+        format_prefix=EmptyFormatter(slots=[prefix]) if prefix else EmptyFormatter(),
+        default_system=default_system,
+        stop_words=[],
+        thought_words=("<think>\n", "\n</think>\n\n"),
+        tool_call_words=("<tool_call>", "</tool_call>"),
+        efficient_eos=False,
+        replace_eos=False,
+        replace_jinja_template=False,
+        enable_thinking=True,
+        mm_plugin=get_mm_plugin(name="base"),
+    )
+
+
+def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args: "DataArguments") -> "Template":
+    r"""Get chat template and fixes the tokenizer."""
+    if data_args.template is None:
+        if isinstance(tokenizer.chat_template, str):
+            logger.warning_rank0("`template` was not specified, try parsing the chat template from the tokenizer.")
+            template = parse_template(tokenizer)
+        else:
+            logger.warning_rank0("`template` was not specified, use `empty` template.")
+            template = TEMPLATES["empty"]  # placeholder
+    else:
+        if data_args.template not in TEMPLATES:
+            raise ValueError(f"Template {data_args.template} does not exist.")
+
+        template = TEMPLATES[data_args.template]
+
+    if data_args.train_on_prompt and template.efficient_eos:
+        raise ValueError("Current template does not support `train_on_prompt`.")
+
+    if data_args.tool_format is not None:
+        logger.info_rank0(f"Using tool format: {data_args.tool_format}.")
+        default_slots = ["{{content}}"] if template.efficient_eos else ["{{content}}", {"eos_token"}]
+        template.format_function = FunctionFormatter(slots=default_slots, tool_format=data_args.tool_format)
+        template.format_tools = ToolFormatter(tool_format=data_args.tool_format)
+
+    if data_args.default_system is not None:
+        logger.info_rank0(f"Using default system message: {data_args.default_system}.")
+        template.default_system = data_args.default_system
+
+    if isinstance(template, ReasoningTemplate):
+        logger.warning_rank0(
+            "You are using reasoning template, "
+            "please add `_nothink` suffix if the model is not a reasoning model. "
+            "e.g., qwen3_vl_nothink"
+        )
+        template.enable_thinking = data_args.enable_thinking
+
+    template.fix_special_tokens(tokenizer)
+    template.fix_jinja_template(tokenizer)
+    return template
+
+
+# alpaca: user turns use "### Instruction:" / "### Response:" headers and assistant turns end with the EOS
+# token followed by a blank line; a default system preamble is supplied.
+register_template(
+    name="alpaca",
+    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n\n"]),
+    default_system=(
+        "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
+    ),
+    replace_jinja_template=True,
+)
+
+
+# bailing: turns are introduced by <role>...</role> markers (HUMAN, SYSTEM, OBSERVATION, ASSISTANT).
+register_template(
+    name="bailing",
+    format_user=StringFormatter(slots=["<role>HUMAN</role>{{content}}<role>ASSISTANT</role>"]),
+    format_system=StringFormatter(slots=["<role>SYSTEM</role>{{content}}"]),
+    format_observation=StringFormatter(slots=["<role>OBSERVATION</role>{{content}}<role>ASSISTANT</role>"]),
+    stop_words=["<|endoftext|>"],
+    efficient_eos=True,
+)
+
+
+# bailing_v2: same <role>...</role> markers as "bailing", but every turn is closed with <|role_end|>,
+# tool results are wrapped in <tool_response> tags and tools use the "ling" tool format.
+register_template(
+    name="bailing_v2",
+    format_user=StringFormatter(slots=["<role>HUMAN</role>{{content}}<|role_end|><role>ASSISTANT</role>"]),
+    format_system=StringFormatter(slots=["<role>SYSTEM</role>{{content}}<|role_end|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|role_end|>"]),
+    format_observation=StringFormatter(
+        slots=[
+            "<role>OBSERVATION</role>\n<tool_response>\n{{content}}\n</tool_response><|role_end|><role>ASSISTANT</role>"
+        ]
+    ),
+    format_function=FunctionFormatter(slots=["{{content}}<|role_end|>"], tool_format="ling"),
+    format_tools=ToolFormatter(tool_format="ling"),
+    stop_words=["<|endoftext|>"],
+    efficient_eos=True,
+)
+
+
+# breeze: user turns are wrapped in "[INST] ... [/INST] " and the prompt starts with the BOS token.
+register_template(
+    name="breeze",
+    format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    efficient_eos=True,
+)
+
+
+# chatglm3: role markers (<|user|>, <|assistant|>, <|system|>, <|observation|>) and the "[gMASK]" / "sop"
+# prefix are given as {"token": ...} slots; tools use the "glm4" tool format.
+register_template(
+    name="chatglm3",
+    format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
+    format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
+    format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(
+        slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
+    ),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+)
+
+
+# chatml: every turn is "<|im_start|>{role}\n{content}<|im_end|>\n"; tool results use the "tool" role.
+register_template(
+    name="chatml",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    stop_words=["<|im_end|>", "<|im_start|>"],
+    replace_eos=True,
+    replace_jinja_template=True,
+)
+
+
+# copied from chatml template; adds a German default system message
+register_template(
+    name="chatml_de",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
+    stop_words=["<|im_end|>", "<|im_start|>"],
+    replace_eos=True,
+    replace_jinja_template=True,
+)
+
+
+# cohere: turns are delimited by <|START_OF_TURN_TOKEN|> / <|END_OF_TURN_TOKEN|> with a role token
+# (<|USER_TOKEN|>, <|CHATBOT_TOKEN|>, <|SYSTEM_TOKEN|>) right after the start marker.
+register_template(
+    name="cohere",
+    format_user=StringFormatter(
+        slots=[
+            (
+                "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+            )
+        ]
+    ),
+    format_system=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+# copied from chatml template; adds the BOS prefix, keeps only "<|im_end|>" as a stop word and does not
+# set replace_eos / replace_jinja_template
+register_template(
+    name="cpm4",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|im_end|>"],
+)
+
+
+# copied from chatml template; adds the DBRX default system prompt, keeps only "<|im_end|>" as a stop
+# word and does not set replace_jinja_template
+register_template(
+    name="dbrx",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    default_system=(
+        "You are DBRX, created by Databricks. You were last updated in December 2023. "
+        "You answer questions based on information available up to that point.\n"
+        "YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough "
+        "responses to more complex and open-ended questions.\nYou assist with various tasks, "
+        "from writing to coding (using markdown for code blocks — remember to use ``` with "
+        "code, JSON, and tables).\n(You do not have real-time data access or code execution "
+        "capabilities. You avoid stereotyping and provide balanced perspectives on "
+        "controversial topics. You do not provide song lyrics, poems, or news articles and "
+        "do not divulge details of your training data.)\nThis is your system prompt, "
+        "guiding your responses. Do not reference it, just respond to the user. If you find "
+        "yourself talking about this message, stop. You should be responding appropriately "
+        "and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION "
+        "ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY."
+    ),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+# deepseek: plain-text "User: ...\n\nAssistant:" turns preceded by the BOS token.
+register_template(
+    name="deepseek",
+    format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+# deepseek3: user turns sit between the <｜User｜> and <｜Assistant｜> markers; the prompt starts with BOS.
+register_template(
+    name="deepseek3",
+    format_user=StringFormatter(slots=["<｜User｜>{{content}}<｜Assistant｜>"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+# copied from deepseek3 template; registered with ReasoningTemplate
+register_template(
+    name="deepseekr1",
+    format_user=StringFormatter(slots=["<｜User｜>{{content}}<｜Assistant｜>"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=ReasoningTemplate,
+)
+
+
+# deepseekcoder: "### Instruction:" / "### Response:" headers; assistant turns are closed with <|EOT|>.
+register_template(
+    name="deepseekcoder",
+    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}\n<|EOT|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    default_system=(
+        "You are an AI programming assistant, utilizing the DeepSeek Coder model, "
+        "developed by DeepSeek Company, and you only answer questions related to computer science. "
+        "For politically sensitive questions, security and privacy issues, "
+        "and other non-computer science questions, you will refuse to answer.\n"
+    ),
+)
+
+
+# default: plain-text "Human:" / "Assistant:" / "System:" turns, each terminated by the EOS token.
+register_template(
+    name="default",
+    format_user=StringFormatter(slots=["Human: {{content}}", {"eos_token"}, "\nAssistant:"]),
+    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
+    format_system=StringFormatter(slots=["System: {{content}}", {"eos_token"}, "\n"]),
+    replace_jinja_template=True,
+)
+
+
+# dots_ocr: <|user|> / <|assistant|> / <|system|> turns closed by matching <|endof...|> markers;
+# multimodal inputs go through the "qwen2_vl" plugin configured with custom image/video tokens.
+register_template(
+    name="dots_ocr",
+    format_user=StringFormatter(slots=["<|user|>{{content}}<|endofuser|><|assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|endofassistant|>"]),
+    format_system=StringFormatter(slots=["<|system|>{{content}}<|endofsystem|>\n"]),
+    stop_words=["<|endofassistant|>"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(
+        name="qwen2_vl",
+        image_token="<|imgpad|>",
+        video_token="<|vidpad|>",
+        vision_bos_token="<|img|>",
+        vision_eos_token="<|endofimg|>",
+    ),
+)
+
+
+# empty: no prompt decoration; the assistant slot is the bare content without an EOS slot.
+# Used by `get_template_and_fix_tokenizer` as the fallback when no template can be determined.
+register_template(
+    name="empty",
+    format_assistant=StringFormatter(slots=["{{content}}"]),
+)
+
+
+# copied from chatml template; turns are followed by a blank line, the default system message sets
+# think_mode=True and no EOS/Jinja replacement flags are set
+register_template(
+    name="ernie",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n\n<|im_start|>assistant\n"]),
+    default_system="<global_setting>\nthink_mode=True\n</global_setting>",
+    stop_words=["<|im_end|>"],
+)
+
+
+# ernie_nothink: plain "User: ...\nAssistant: " turns; the prompt starts with <|begin_of_sentence|> and
+# assistant turns end with <|end_of_sentence|>.
+register_template(
+    name="ernie_nothink",
+    format_user=StringFormatter(slots=["User: {{content}}\nAssistant: "]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_sentence|>"]),
+    format_system=StringFormatter(slots=["{{content}}\n"]),
+    format_prefix=EmptyFormatter(slots=["<|begin_of_sentence|>"]),
+    stop_words=["<|end_of_sentence|>"],
+)
+
+
+# ernie_vl: the "\nAssistant: " header is part of the assistant slot rather than the user slot;
+# registered with ReasoningTemplate and the "ernie_vl" multimodal plugin.
+register_template(
+    name="ernie_vl",
+    format_user=StringFormatter(slots=["User: {{content}}"]),
+    format_assistant=StringFormatter(slots=["\nAssistant: {{content}}<|end_of_sentence|>"]),
+    format_system=StringFormatter(slots=["{{content}}\n"]),
+    stop_words=["<|end_of_sentence|>"],
+    replace_eos=True,
+    replace_jinja_template=True,
+    template_class=ReasoningTemplate,
+    mm_plugin=get_mm_plugin(name="ernie_vl", image_token="<|IMAGE_PLACEHOLDER|>", video_token="<|VIDEO_PLACEHOLDER|>"),
+)
+
+
+# exaone: [|user|] / [|assistant|] / [|system|] role markers; assistant turns end with the EOS token
+# and a newline.
+register_template(
+    name="exaone",
+    format_user=StringFormatter(slots=["[|user|]{{content}}\n[|assistant|]"]),
+    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
+    format_system=StringFormatter(slots=["[|system|]{{content}}[|endofturn|]\n"]),
+)
+
+
+# falcon: plain-text "User: ...\nFalcon:" turns; assistant turns end with a newline.
+register_template(
+    name="falcon",
+    format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
+    format_assistant=StringFormatter(slots=["{{content}}\n"]),
+    efficient_eos=True,
+)
+
+
+# copied from chatml template; adds the BOS prefix, uses "<|im_end|>" and "<|end_of_text|>" as stop
+# words and does not set replace_eos / replace_jinja_template
+register_template(
+    name="falcon_h1",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|im_end|>", "<|end_of_text|>"],
+)
+
+
+# fewshot: no user decoration; every assistant answer is followed by a blank line.
+register_template(
+    name="fewshot",
+    format_assistant=StringFormatter(slots=["{{content}}\n\n"]),
+    efficient_eos=True,
+    replace_jinja_template=True,
+)
+
+
+# gemma: "<start_of_turn>{role}\n...<end_of_turn>\n" turns in which the assistant role is named "model";
+# registered with Llama2Template.
+register_template(
+    name="gemma",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    template_class=Llama2Template,
+)
+
+
+# copied from gemma template; adds "<eos>" as a stop word and sets efficient_eos instead of replace_eos
+register_template(
+    name="gemma2",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<eos>", "<end_of_turn>"],
+    efficient_eos=True,
+    template_class=Llama2Template,
+)
+
+
+# copied from gemma template; adds the "gemma3" multimodal plugin
+register_template(
+    name="gemma3",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin("gemma3", image_token="<image_soft_token>"),
+    template_class=Llama2Template,
+)
+
+
+# gemma3n: same turn layout as the gemma template, with the "gemma3n" multimodal plugin configured
+# with image and audio tokens.
+register_template(
+    name="gemma3n",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin("gemma3n", image_token="<image_soft_token>", audio_token="<audio_soft_token>"),
+    template_class=Llama2Template,
+)
+
+
+# glm4: <|user|>, <|assistant|>, <|system|> and <|observation|> role markers after a "[gMASK]<sop>"
+# prefix; tools use the "glm4" tool format.
+register_template(
+    name="glm4",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+)
+
+
+# copied from glm4 template; uses the "glm4_moe" tool format and ReasoningTemplate
+register_template(
+    name="glm4_moe",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4_moe"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4_moe"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template; adds "</answer>" as a stop word, the "glm4v" multimodal plugin and
+# ReasoningTemplate
+register_template(
+    name="glm4v",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>", "</answer>"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(name="glm4v", image_token="<|image|>", video_token="<|video|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template; uses the "glm4_moe" tool format and adds "</answer>" as a stop word,
+# the "glm4v" multimodal plugin and ReasoningTemplate
+register_template(
+    name="glm4_5v",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4_moe"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4_moe"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>", "</answer>"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(name="glm4v", image_token="<|image|>", video_token="<|video|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template; registered with ReasoningTemplate
+register_template(
+    name="glmz1",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# gpt_oss: "<|start|>{role}<|message|>...<|end|>" turns; the thought words are the "analysis" channel
+# header and the switch to the "final" channel. Registered with ReasoningTemplate.
+register_template(
+    name="gpt_oss",
+    format_user=StringFormatter(slots=["<|start|>user<|message|>{{content}}<|end|><|start|>assistant"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>"]),
+    format_system=StringFormatter(slots=["<|start|>system<|message|>{{content}}<|end|>"]),
+    default_system="You are ChatGPT, a large language model trained by OpenAI.",
+    thought_words=("<|channel|>analysis<|message|>", "<|end|><|start|>assistant<|channel|>final<|message|>"),
+    efficient_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# granite3: "<|start_of_role|>{role}<|end_of_role|>...<|end_of_text|>\n" turns.
+register_template(
+    name="granite3",
+    format_user=StringFormatter(
+        slots=[
+            "<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+    format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
+)
+
+
+# granite3_vision: line-based <|user|> / <|assistant|> / <|system|> markers with a default system
+# message and the "llava_next" multimodal plugin.
+register_template(
+    name="granite3_vision",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}\n<|assistant|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}\n"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+register_template(
+    name="granite4",
+    format_user=StringFormatter(
+        slots=[
+            "<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+    format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|end_of_text|>\n"], tool_format="default"),
+    format_observation=StringFormatter(
+        slots=["<|start_of_role|>tool<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="default"),
+    stop_words=["<|end_of_text|>"],
+    default_system="You are Granite, developed by IBM. You are a helpful AI assistant.",
+)
+
+
+# index: user turns sit between "reserved_0" and "reserved_1"; system messages are prefixed with "<unk>".
+register_template(
+    name="index",
+    format_user=StringFormatter(slots=["reserved_0{{content}}reserved_1"]),
+    format_system=StringFormatter(slots=["<unk>{{content}}"]),
+    efficient_eos=True,
+)
+
+
+# hunyuan: user turns end with <|extra_0|>, system messages with <|extra_4|> and assistant turns with
+# <|eos|>; the prompt starts with <|startoftext|>.
+register_template(
+    name="hunyuan",
+    format_user=StringFormatter(slots=["{{content}}<|extra_0|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|eos|>"]),
+    format_system=StringFormatter(slots=["{{content}}<|extra_4|>"]),
+    format_prefix=EmptyFormatter(slots=["<|startoftext|>"]),
+    stop_words=["<|eos|>"],
+)
+
+
+# hunyuan_small: turns are delimited by <｜hy_...｜> markers; the marker that closes assistant turns is
+# also the stop word.
+register_template(
+    name="hunyuan_small",
+    format_user=StringFormatter(slots=["<｜hy_User｜>{{content}}<｜hy_place▁holder▁no▁8｜>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<｜hy_place▁holder▁no▁2｜>"]),
+    format_system=StringFormatter(slots=["{{content}}<｜hy_place▁holder▁no▁3｜>"]),
+    format_prefix=EmptyFormatter(slots=["<｜hy_begin▁of▁sentence｜>"]),
+    stop_words=["<｜hy_place▁holder▁no▁2｜>"],
+)
+
+
+# intern2: "<|im_start|>{role}\n...<|im_end|>\n" turns with a BOS prefix and the InternLM default
+# system message.
+register_template(
+    name="intern2",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    default_system=(
+        "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory "
+        "(上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+        "- InternLM (书生·浦语) can understand and communicate fluently in the language "
+        "chosen by the user such as English and 中文."
+    ),
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(  # "intern_vl": same turn markup and prefix as "intern2", plus the intern_vl multimodal plugin
+    name="intern_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    default_system=(  # Chinese-language default system prompt; runtime text, keep verbatim
+        "你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"
+    ),
+    stop_words=["<|im_end|>"],
+    mm_plugin=get_mm_plugin(name="intern_vl", image_token="<image>", video_token="<video>"),
+)
+
+
+register_template(  # "intern_s1": same markup and intern_vl plugin as "intern_vl", but no default_system
+    name="intern_s1",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|im_end|>"],
+    mm_plugin=get_mm_plugin(name="intern_vl", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from qwen template; no default_system, adds the qwen2_vl plugin and ReasoningTemplate
+register_template(
+    name="keye_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(
+    name="kimi_vl",
+    format_user=StringFormatter(
+        slots=["<|im_user|>user<|im_middle|>{{content}}<|im_end|><|im_assistant|>assistant<|im_middle|>"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>"]),
+    format_system=StringFormatter(slots=["<|im_system|>system<|im_middle|>{{content}}<|im_end|>"]),
+    default_system="You are a helpful assistant",
+    stop_words=["<|im_end|>"],
+    thought_words=("◁think▷", "◁/think▷"),
+    mm_plugin=get_mm_plugin("kimi_vl", image_token="<|media_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(  # "lfm2": <|im_start|>role ... <|im_end|> turns with the lfm2 tool format
+    name="lfm2",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2"),
+    format_observation=StringFormatter(  # tool results use a dedicated "tool" role and response markers
+        slots=[
+            "<|im_start|>tool\n<|tool_response_start|>{{content}}<|tool_response_end|><|im_end|>\n"
+            "<|im_start|>assistant\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="lfm2"),
+    default_system="You are a helpful AI assistant.",
+    stop_words=["<|im_end|>"],
+    tool_call_words=("<|tool_call_start|>", "<|tool_call_end|>"),  # presumably delimit a tool call — confirm
+    replace_eos=True,
+)
+
+
+register_template(  # "lfm2_vl": same markup as "lfm2", own default_system, plus the lfm2_vl multimodal plugin
+    name="lfm2_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2"),
+    format_observation=StringFormatter(  # tool results use a dedicated "tool" role and response markers
+        slots=[
+            "<|im_start|>tool\n<|tool_response_start|>{{content}}<|tool_response_end|><|im_end|>\n"
+            "<|im_start|>assistant\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="lfm2"),
+    default_system="You are a helpful multimodal assistant by Liquid AI.",
+    stop_words=["<|im_end|>"],
+    tool_call_words=("<|tool_call_start|>", "<|tool_call_end|>"),  # presumably delimit a tool call — confirm
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="lfm2_vl", image_token="<image>"),
+)
+
+
+register_template(  # "llama2": each user turn is the bos token followed by "[INST] ... [/INST]"
+    name="llama2",
+    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
+    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),  # system text sits in <<SYS>> tags
+    template_class=Llama2Template,  # NOTE(review): class is defined outside this chunk — see it for system handling
+)
+
+
+# copied from llama2 template; only difference is the bilingual (English/Chinese) default_system
+register_template(
+    name="llama2_zh",
+    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
+    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
+    default_system="You are a helpful assistant. 你是一个乐于助人的助手。",
+    template_class=Llama2Template,
+)
+
+
+register_template(  # "llama3": <|start_header_id|>role<|end_header_id|> headers, turns end with <|eot_id|>
+    name="llama3",
+    format_user=StringFormatter(
+        slots=[
+            (  # one slot: the two adjacent literals concatenate (user turn + opening assistant header)
+                "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot_id|>"]),
+    format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3"),
+    format_observation=StringFormatter(
+        slots=[
+            (  # tool results are sent under the "ipython" role
+                "<|start_header_id|>ipython<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>", "<|eom_id|>"],
+    replace_eos=True,
+)
+
+
+register_template(  # "llama4": <|header_start|>role<|header_end|> headers, turns end with <|eot|>
+    name="llama4",
+    format_user=StringFormatter(
+        slots=["<|header_start|>user<|header_end|>\n\n{{content}}<|eot|><|header_start|>assistant<|header_end|>\n\n"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot|>"]),
+    format_system=StringFormatter(slots=["<|header_start|>system<|header_end|>\n\n{{content}}<|eot|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot|>"], tool_format="llama3"),  # reuses llama3 format
+    format_observation=StringFormatter(  # tool results are sent under the "ipython" role
+        slots=[
+            "<|header_start|>ipython<|header_end|>\n\n{{content}}<|eot|><|header_start|>assistant<|header_end|>\n\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot|>", "<|eom|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="llama4", image_token="<|image|>"),
+)
+
+
+# copied from llama3 template; adds the mllama multimodal plugin with <|image|> as image token
+register_template(
+    name="mllama",
+    format_user=StringFormatter(
+        slots=[
+            (  # one slot: the two adjacent literals concatenate (user turn + opening assistant header)
+                "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot_id|>"]),
+    format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3"),
+    format_observation=StringFormatter(
+        slots=[
+            (  # tool results are sent under the "ipython" role
+                "<|start_header_id|>ipython<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>", "<|eom_id|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="mllama", image_token="<|image|>"),
+)
+
+
+register_template(  # "moonlight": <|im_user|>/<|im_assistant|>/<|im_system|> roles, content follows <|im_middle|>
+    name="moonlight",
+    format_user=StringFormatter(
+        slots=["<|im_user|>user<|im_middle|>{{content}}<|im_end|><|im_assistant|>assistant<|im_middle|>"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>"]),
+    format_system=StringFormatter(slots=["<|im_system|>system<|im_middle|>{{content}}<|im_end|>"]),
+    default_system="You are a helpful assistant provided by Moonshot-AI.",
+    stop_words=["<|im_end|>"],  # same marker that closes every turn above
+    replace_eos=True,
+)
+
+
+# copied from vicuna template; adds the llava multimodal plugin with <image> as image token
+register_template(
+    name="llava",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(  # adjacent string literals concatenate into a single system prompt
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava", image_token="<image>"),
+)
+
+
+# copied from vicuna template; adds the llava_next multimodal plugin with <image> as image token
+register_template(
+    name="llava_next",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(  # adjacent string literals concatenate into a single system prompt
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from llama3 template; adds the llava_next multimodal plugin with <image> as image token
+register_template(
+    name="llava_next_llama3",
+    format_user=StringFormatter(
+        slots=[
+            (  # one slot: the two adjacent literals concatenate (user turn + opening assistant header)
+                "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot_id|>"]),
+    format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3"),
+    format_observation=StringFormatter(
+        slots=[
+            (  # tool results are sent under the "ipython" role
+                "<|start_header_id|>ipython<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>", "<|eom_id|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from mistral template; adds the llava_next multimodal plugin with <image> as image token
+register_template(
+    name="llava_next_mistral",
+    format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
+    format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),  # leading space, then the eos token
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS] {"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+    template_class=Llama2Template,
+)
+
+
+# copied from qwen template; generic default_system and the llava_next multimodal plugin (<image>)
+register_template(
+    name="llava_next_qwen",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from chatml template; adds the llava_next multimodal plugin with <image> as image token
+register_template(
+    name="llava_next_yi",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],  # same marker that closes every turn above
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from vicuna template; adds the llava_next_video plugin with <image> and <video> tokens
+register_template(
+    name="llava_next_video",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(  # adjacent string literals concatenate into a single system prompt
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from mistral template; adds the llava_next_video plugin with <image> and <video> tokens
+register_template(
+    name="llava_next_video_mistral",
+    format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
+    format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),  # leading space, then the eos token
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS] {"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>"),
+    template_class=Llama2Template,
+)
+
+
+# copied from chatml template; adds the llava_next_video plugin with <image> and <video> tokens
+register_template(
+    name="llava_next_video_yi",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],  # same marker that closes every turn above
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from qwen template; generic default_system and ReasoningTemplate as template_class
+register_template(
+    name="mimo",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen template; MiMo default_system, explicit <think>/</think> thought_words, ReasoningTemplate
+register_template(
+    name="mimo_v2",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are MiMo, a helpful AI assistant engineered by Xiaomi.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    thought_words=("<think>", "</think>"),  # presumably open/close markers of the reasoning span — confirm
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen2vl; MiMo default_system, same qwen2_vl plugin, ReasoningTemplate as template_class
+register_template(
+    name="mimo_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are MiMo, an AI assistant developed by Xiaomi.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from chatml template
+register_template(
+    name="minicpm_v",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    default_system="You are a helpful assistant.",
+    mm_plugin=get_mm_plugin(name="minicpm_v", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from minicpm_v template
+register_template(
+    name="minicpm_o",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    default_system="You are a helpful assistant. You can accept audio and text input and output voice and text.",
+    mm_plugin=get_mm_plugin(name="minicpm_v", image_token="<image>", video_token="<video>", audio_token="<audio>"),
+)
+
+
+register_template(  # "minimax1": turns are <beginning_of_sentence>role name=... up to <end_of_sentence>
+    name="minimax1",
+    format_user=StringFormatter(  # the assistant header uses role "ai" with name=assistant
+        slots=[
+            "<beginning_of_sentence>user name=user\n{{content}}<end_of_sentence>\n<beginning_of_sentence>ai name=assistant\n"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_sentence>\n"]),
+    format_system=StringFormatter(
+        slots=["<beginning_of_sentence>system ai_setting=assistant\n{{content}}<end_of_sentence>\n"]
+    ),
+    format_function=FunctionFormatter(slots=["{{content}}<end_of_sentence>\n"], tool_format="minimax1"),
+    format_observation=StringFormatter(  # tool results use role "tool" with name=tools
+        slots=[
+            "<beginning_of_sentence>tool name=tools\n{{content}}<end_of_sentence>\n<beginning_of_sentence>ai name=assistant\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="minimax1"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<end_of_sentence>"],
+)
+
+
+register_template(  # "minimax2": turns open with "]~b]role" and close with "[e~["; assistant role is "ai"
+    name="minimax2",
+    format_user=StringFormatter(slots=["]~b]user\n{{content}}[e~[\n]~b]ai\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}[e~[\n"]),
+    format_system=StringFormatter(slots=["]~!b[]~b]system\n{{content}}[e~[\n"]),  # extra leading "]~!b[" marker
+    format_function=FunctionFormatter(slots=["{{content}}[e~[\n"], tool_format="minimax2"),
+    format_observation=StringFormatter(slots=["]~b]tool\n<response>{{content}}</response>[e~[\n]~b]ai\n"]),
+    format_tools=ToolFormatter(tool_format="minimax2"),
+    default_system="You are a helpful assistant. Your name is MiniMax-M2.1 and is built by MiniMax.",
+    stop_words=["[e~["],  # same marker that closes every turn above
+    template_class=ReasoningTemplate,
+)
+
+
+# mistral tokenizer v3 tekken: no space after [INST], [TOOL_CALLS] or [TOOL_RESULTS] (compare "mistral")
+register_template(
+    name="ministral",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,  # no format_assistant is passed here, unlike the "mistral" template
+)
+
+
+# mistral tokenizer v3: a space follows [INST], [TOOL_CALLS] and [TOOL_RESULTS] and precedes assistant text
+register_template(
+    name="mistral",
+    format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
+    format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),  # leading space, then the eos token
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS] {"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,
+)
+
+
+# mistral tokenizer v7 tekken (copied from ministral); system text is wrapped in [SYSTEM_PROMPT] tags
+register_template(
+    name="mistral_small",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["[SYSTEM_PROMPT]{{content}}[/SYSTEM_PROMPT]"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),  # unlike ministral, no template_class is passed
+)
+
+
+register_template(
+    name="ministral3",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
+)
+
+
+register_template(  # "olmo": user text under <|user|>, followed directly by the <|assistant|> header
+    name="olmo",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"eos_token"}]),  # prefix slot is eos_token here, not bos_token
+)
+
+
+register_template(  # "openchat": user slot is "GPT4 Correct User: ...", the eos token, then the assistant label
+    name="openchat",
+    format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+register_template(  # "openchat-3.6": header-id markup with "GPT4 Correct User/Assistant" as role names
+    name="openchat-3.6",
+    format_user=StringFormatter(
+        slots=[
+            (  # one slot: the two adjacent literals concatenate (user turn + opening assistant header)
+                "<|start_header_id|>GPT4 Correct User<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>"],  # same marker that closes the user turn above
+)
+
+
+# copied from chatml template; tool results use a "tool" role, default_system names OpenCoder
+register_template(
+    name="opencoder",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    default_system="You are OpenCoder, created by OpenCoder Team.",
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(  # "paligemma": user text followed by a newline, bos-token prefix, paligemma plugin
+    name="paligemma",
+    format_user=StringFormatter(slots=["{{content}}\n"]),  # no role markers, only a trailing newline
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="paligemma", image_token="<image>"),
+    template_class=Llama2Template,
+)
+
+
+# copied from gemma template; assistant role is "model", plus the paligemma multimodal plugin (<image>)
+register_template(
+    name="paligemma_chat",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_observation=StringFormatter(  # tool results are sent under the "tool" role
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="paligemma", image_token="<image>"),
+    template_class=Llama2Template,
+)
+
+
+register_template(  # "phi": <|user|>/<|assistant|>/<|system|> headers on their own line, turns end with <|end|>
+    name="phi",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
+    stop_words=["<|end|>"],  # same marker that closes every turn above
+    replace_eos=True,
+)
+
+
+register_template(  # "phi_small": same turn markup as "phi", plus a format_prefix slot
+    name="phi_small",  # NOTE(review): the prefix set below is not {"bos_token"}/{"eos_token"} — confirm it resolves
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"<|endoftext|>"}]),
+    stop_words=["<|end|>"],  # same marker that closes every turn above
+    replace_eos=True,
+)
+
+
+register_template(  # "phi4": <|im_start|>role<|im_sep|>content<|im_end|> with no newlines between parts
+    name="phi4",
+    format_user=StringFormatter(
+        slots=["<|im_start|>user<|im_sep|>{{content}}<|im_end|><|im_start|>assistant<|im_sep|>"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>"]),
+    format_system=StringFormatter(slots=["<|im_start|>system<|im_sep|>{{content}}<|im_end|>"]),
+    stop_words=["<|im_end|>"],  # same marker that closes every turn above
+    replace_eos=True,
+)
+
+
+register_template(  # "phi4_mini": <|role|>content<|end|> with no newlines (compare the "phi" template)
+    name="phi4_mini",
+    format_user=StringFormatter(slots=["<|user|>{{content}}<|end|><|assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>"]),
+    format_system=StringFormatter(slots=["<|system|>{{content}}<|end|>"]),
+    format_tools=StringFormatter(slots=["<|tool|>{{content}}<|/tool|>"]),  # a StringFormatter, not a ToolFormatter
+    stop_words=["<|end|>"],
+    replace_eos=True,
+)
+
+
+# copied from ministral template; adds the pixtral multimodal plugin with [IMG] as image token
+register_template(
+    name="pixtral",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
+    template_class=Llama2Template,
+)
+
+
+# copied from chatml template; tool calls use the qwen tool format, Qwen persona as default_system
+register_template(
+    name="qwen",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+# copied from qwen template; no default_system, and ReasoningTemplate as template_class
+register_template(
+    name="qwen3",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen template
+register_template(
+    name="qwen3_nothink",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>", "<think>", "</think>"],
+    replace_eos=True,
+)
+
+
+# copied from chatml template; generic default_system and the qwen2_audio plugin (<|AUDIO|> token)
+register_template(
+    name="qwen2_audio",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_audio", audio_token="<|AUDIO|>"),
+)
+
+
+# copied from qwen template; generic default_system and the qwen2_omni plugin (image/video/audio tokens)
+register_template(
+    name="qwen2_omni",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(  # besides the media tokens, begin/end markers for vision and audio are passed
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
+    ),
+)
+
+
+register_template(  # "qwen3_omni": qwen-style markup, qwen2_omni plugin with *_pad tokens, ReasoningTemplate
+    name="qwen3_omni",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool results are sent back as a user turn inside <tool_response>
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(  # reuses the plugin named "qwen2_omni"; no vision/audio bos/eos tokens are passed
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
+    ),
+    template_class=ReasoningTemplate,
+)
+
+
+# Identical to the "qwen3_omni" registration except that `template_class` is not overridden,
+# i.e. ReasoningTemplate is not requested for this variant.
+register_template(
+    name="qwen3_omni_nothink",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
+    ),
+)
+
+
+# copied from qwen template
+# Adds the "qwen2_vl" multimodal plugin with image and video placeholder tokens (no audio token).
+register_template(
+    name="qwen2_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+)
+
+
+# copied from qwen template
+# Compared with "qwen2_vl": uses the "qwen3_vl" plugin, sets no default system prompt and
+# requests ReasoningTemplate as the template class.
+register_template(
+    name="qwen3_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen3_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen template
+# Identical to the "qwen3_vl" registration except that `template_class` is not overridden.
+register_template(
+    name="qwen3_vl_nothink",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen3_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+)
+
+
+# Uses `<|im_start|>` / `<|im_end|>` markers, but the user turn is labelled "question" and the
+# reply header is "answer" instead of "user" / "assistant".
+register_template(
+    name="sailor",
+    format_user=StringFormatter(slots=["<|im_start|>question\n{{content}}<|im_end|>\n<|im_start|>answer\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    default_system=(
+        "You are an AI assistant named Sailor created by Sea AI Lab. "
+        "Your answer should be friendly, unbiased, faithful, informative and detailed."
+    ),
+    stop_words=["<|im_end|>"],
+)
+
+
+# Turns are delimited with {"bos_token"} / {"eos_token"} slots rather than literal marker strings
+# (presumably resolved to the tokenizer's special tokens by the formatter; confirm in formatter.py).
+register_template(
+    name="seed_coder",
+    format_user=StringFormatter(
+        slots=[{"bos_token"}, "user\n{{content}}", {"eos_token"}, {"bos_token"}, "assistant\n"]
+    ),
+    format_system=StringFormatter(slots=[{"bos_token"}, "system\n{{content}}", {"eos_token"}]),
+    # The default system prompt ends with a blank line ("\n\n").
+    default_system=(
+        "You are an AI programming assistant, utilizing the Seed-Coder model, developed by ByteDance Seed, "
+        "and you only answer questions related to computer science. For politically sensitive questions, "
+        "security and privacy issues, and other non-computer science questions, you will refuse to answer.\n\n"
+    ),
+)
+
+
+# copied from seed_coder
+# Differences from "seed_coder": no default system prompt, tool calling via the "seed_oss" tool
+# format, ReasoningTemplate as the template class and `<seed:think>` tags as thought words.
+register_template(
+    name="seed_oss",
+    format_user=StringFormatter(
+        slots=[{"bos_token"}, "user\n{{content}}", {"eos_token"}, {"bos_token"}, "assistant\n"]
+    ),
+    format_system=StringFormatter(slots=[{"bos_token"}, "system\n{{content}}", {"eos_token"}]),
+    format_function=FunctionFormatter(slots=[{"bos_token"}, "\n{{content}}", {"eos_token"}], tool_format="seed_oss"),
+    format_tools=ToolFormatter(tool_format="seed_oss"),
+    template_class=ReasoningTemplate,
+    thought_words=("<seed:think>", "</seed:think>"),
+)
+
+
+# `<|im_start|>role ... <|im_end|>` turns with "<|im_end|>" as stop word; no default system prompt.
+register_template(
+    name="smollm",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+)
+
+
+# Same formatters and stop word as "smollm", plus a default system prompt.
+register_template(
+    name="smollm2",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    default_system="You are a helpful AI assistant named SmolLM, trained by Hugging Face.",
+)
+
+
+# Plain-text "### User:" / "### Assistant:" / "### System:" section headers; registered with
+# `efficient_eos=True` and without a custom assistant formatter or stop words.
+register_template(
+    name="solar",
+    format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
+    format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),
+    efficient_eos=True,
+)
+
+
+# `<|user|>` / `<|assistant|>` / `<|system|>` role markers; every turn is closed with "<|end|>",
+# which is also the stop word.
+register_template(
+    name="starchat",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
+    stop_words=["<|end|>"],
+)
+
+
+# `<_user>` / `<_bot>` / `<_system>` markers without newlines between marker and content.
+# The default system prompt is in Chinese and introduces the assistant as TeleChat, developed by
+# China Telecom's AI company and the China Telecom AI research institute (TeleAI).
+register_template(
+    name="telechat2",
+    format_user=StringFormatter(slots=["<_user>{{content}}<_bot>"]),
+    format_system=StringFormatter(slots=["<_system>{{content}}"]),
+    default_system=(
+        "你是中国电信星辰语义大模型，英文名是TeleChat，你是由中电信人工智能科技有限公司和中国电信人工智能研究院（TeleAI）研发的人工智能助手。"
+    ),
+)
+
+
+# Single-line "USER: ... ASSISTANT:" prompt with a default system prompt; no system formatter is
+# passed, and `replace_jinja_template=True` is set for this template.
+register_template(
+    name="vicuna",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    replace_jinja_template=True,
+)
+
+
+# Same user format and default system prompt as the "vicuna" template, combined with the
+# "video_llava" multimodal plugin (`<image>` and `<video>` placeholder tokens).
+register_template(
+    name="video_llava",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="video_llava", image_token="<image>", video_token="<video>"),
+)
+
+
+# Single-line "Human: ... Assistant:" prompt. The Chinese default system prompt describes a
+# Human/Assistant dialogue with helpful, detailed and polite answers and refusal of unethical,
+# unsafe, controversial or politically sensitive topics; it ends with a newline.
+register_template(
+    name="xuanyuan",
+    format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
+    default_system=(
+        "以下是用户和人工智能助手之间的对话。用户以Human开头，人工智能助手以Assistant开头，"
+        "会对人类提出的问题给出有帮助、高质量、详细和礼貌的回答，并且总是拒绝参与与不道德、"
+        "不安全、有争议、政治敏感等相关的话题、问题和指示。\n"
+    ),
+)
+
+
+# copied from chatml template
+# `<|im_start|>role ... <|im_end|>` turns with "<|im_end|>" as stop word; no default system prompt.
+register_template(
+    name="yi",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+)
+
+
+# "### Human:" / "### Assistant:" headers with "###" as stop word. Images are handled by the
+# "llava" plugin with the `<image>` placeholder token.
+register_template(
+    name="yi_vl",
+    format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]),
+    format_assistant=StringFormatter(slots=["{{content}}\n"]),
+    # Bilingual default system prompt: the English instructions are followed by their Chinese
+    # counterpart, and the whole prompt ends with a blank line ("\n\n").
+    default_system=(
+        "This is a chat between an inquisitive human and an AI assistant. "
+        "Assume the role of the AI assistant. Read all the images carefully, "
+        "and respond to the human's questions with informative, helpful, detailed and polite answers. "
+        "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。"
+        "仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n"
+    ),
+    stop_words=["###"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(name="llava", image_token="<image>"),
+)
+
+
+# `<|User|>` / `<|Assistant|>` markers with "<|end_of_text|>" closing assistant turns and acting as
+# stop word. The system prompt is emitted verbatim, tools use the "default" tool format, the
+# prompt is prefixed with a {"bos_token"} slot and ReasoningTemplate is the template class.
+register_template(
+    name="youtu",
+    format_user=StringFormatter(slots=["<|User|>{{content}}<|Assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>"]),
+    format_system=StringFormatter(slots=["{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="default"),
+    # Tool results are wrapped in <tool_response> tags and directly followed by the assistant marker.
+    format_observation=StringFormatter(slots=["<tool_response>\n{{content}}\n</tool_response><|Assistant|>"]),
+    format_tools=ToolFormatter(tool_format="default"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|end_of_text|>"],
+    replace_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# Each turn is written as `<|begin_of_text|>role\n ... <|end_of_text|>\n`; "<|end_of_text|>" is the
+# stop word. Images and videos are handled by the "youtu_vl" plugin with `<|*_pad|>` tokens.
+register_template(
+    name="youtu_vl",
+    format_user=StringFormatter(
+        slots=["<|begin_of_text|>user\n{{content}}<|end_of_text|>\n<|begin_of_text|>assistant\n"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+    format_system=StringFormatter(slots=["<|begin_of_text|>system\n{{content}}<|end_of_text|>\n"]),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|end_of_text|>"],
+    mm_plugin=get_mm_plugin(name="youtu_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+)
+
+
+# The user content is followed by a {"token": "<sep>"} slot; assistant turns end with "<eod>",
+# which is also the stop word. No system formatter or default system prompt is registered.
+register_template(
+    name="yuan",
+    format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
+    format_assistant=StringFormatter(slots=["{{content}}<eod>\n"]),
+    stop_words=["<eod>"],
+)
+
+
+# `<|user|>` / `<|assistant|>` / `<|system|>` markers; user and system turns are closed with an
+# {"eos_token"} slot instead of a literal end marker.
+register_template(
+    name="zephyr",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
+    default_system="You are Zephyr, a helpful assistant.",
+)
diff --git a/LlamaFactory/src/llamafactory/data/__init__.py b/LlamaFactory/src/llamafactory/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c8c9fcecd10e736e240196fde98f833c9df3dc
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .collator import (
+    KTODataCollatorWithPadding,
+    MultiModalDataCollatorForSeq2Seq,
+    PairwiseDataCollatorWithPadding,
+    SFTDataCollatorWith4DAttentionMask,
+)
+from .data_utils import Role, split_dataset
+from .loader import get_dataset
+from .template import TEMPLATES, Template, get_template_and_fix_tokenizer
+
+
+__all__ = [
+    "TEMPLATES",
+    "KTODataCollatorWithPadding",
+    "MultiModalDataCollatorForSeq2Seq",
+    "PairwiseDataCollatorWithPadding",
+    "Role",
+    "SFTDataCollatorWith4DAttentionMask",
+    "Template",
+    "get_dataset",
+    "get_template_and_fix_tokenizer",
+    "split_dataset",
+]
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..067f69d8fe92b08976f920e1b7e3bab6c03bd1af
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8dc3743cffbd0d5fbc4ed634688d433e389b139
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3c51a42f86d5d34ddb43fe79f3d47d25e918a65
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/collator.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/collator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e86bf70c0f067553f1f571a57d6bd59ad5611b9
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/collator.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20ed3c5245b14da447b4953a8b02a9a58f741fe4
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/converter.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/converter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56522decdf136669afff2b99fc38d3e0b7995fc6
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/converter.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9470bc3bee351da34dbebf82d147216b3873c2da
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/data_utils.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/data_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0dbf22c8a311a8720c6802184ac76f37617d97d
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/data_utils.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..381bff8f7ac9221d9eed999180649480eb523de9
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/formatter.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/formatter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1046893467cd33cafc20129b94bd11b411ddc328
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/formatter.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3657b243a562840a389586d6e869ea2f15eec32c
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/loader.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/loader.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..348714cbd2b19dd59b2d65e4ec69106cab057ed1
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/loader.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/mm_plugin.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/mm_plugin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3f81620eeeba0de179c2258d73536b64c5050d4
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/mm_plugin.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37550cf54b8ed29774410371f3d6bea0dc9161f9
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/parser.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/parser.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22b1430fcef225acdf66e1f974adb7f8a514b9e0
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/parser.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/template.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/template.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..447c62db38aa50ed80534353f6ad2749ebb358b1
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/template.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/template.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/template.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a842a1f12b827a1d93cca69598f1e8e8e9f1468
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/template.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b69b0ba74331141e9ae82390d4c81cb584c869b5
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/__pycache__/tool_utils.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/__pycache__/tool_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80f891aab554c78e9ce5d3982f267304bd1f40b4
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/__pycache__/tool_utils.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/collator.py b/LlamaFactory/src/llamafactory/data/collator.py
new file mode 100644
index 0000000000000000000000000000000000000000..162f432c9e5bf195ed4c6a821eb36d279bf3bac4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/collator.py
@@ -0,0 +1,331 @@
+# Copyright 2025 OpenAccess AI Collective and the LlamaFactory team.
+#
+# This code is inspired by the OpenAccess AI Collective's axolotl library.
+# https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/src/axolotl/monkeypatch/utils.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from peft import PeftModel
+from transformers import DataCollatorForSeq2Seq
+
+from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER
+from ..extras.packages import is_pillow_available
+
+
+if is_pillow_available():
+    from PIL import Image
+
+
+if TYPE_CHECKING:
+    from transformers import ProcessorMixin
+
+    from .template import Template
+
+
+def prepare_4d_attention_mask(attention_mask_with_indices: "torch.Tensor", dtype: "torch.dtype") -> "torch.Tensor":
+    r"""Expand 2d attention mask to 4d attention mask.
+
+    Expand the attention mask with indices from (batch_size, seq_len) to (batch_size, 1, seq_len, seq_len),
+    handle packed sequences and transforms the mask to lower triangular form to prevent future peeking.
+
+    e.g.
+    ```python
+    # input
+    [[1, 1, 2, 2, 2, 0]]
+    # output
+    [
+        [
+            [
+                [o, x, x, x, x, x],
+                [o, o, x, x, x, x],
+                [x, x, o, x, x, x],
+                [x, x, o, o, x, x],
+                [x, x, o, o, o, x],
+                [x, x, x, x, x, x],
+            ]
+        ]
+    ]
+    ```
+    where `o` equals to `0.0`, `x` equals to `min_dtype`.
+    """
+    _, seq_len = attention_mask_with_indices.size()
+    min_dtype = torch.finfo(dtype).min
+    zero_tensor = torch.tensor(0, dtype=dtype)
+
+    # Create a non-padding mask.
+    non_padding_mask = (attention_mask_with_indices != 0).unsqueeze(1).unsqueeze(2)
+    # Create indices for comparison.
+    indices = attention_mask_with_indices.unsqueeze(1).unsqueeze(2)  # [bsz, 1, 1, seq_len]
+    indices_t = attention_mask_with_indices.unsqueeze(1).unsqueeze(3)  # [bsz, 1, seq_len, 1]
+    # Create a lower triangular mask.
+    tril_mask = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool))
+    attention_mask_4d = (indices == indices_t) & non_padding_mask & tril_mask
+    # Invert the attention mask.
+    attention_mask_4d = torch.where(attention_mask_4d, zero_tensor, min_dtype)
+    return attention_mask_4d
+
+
+@dataclass
+class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
+    r"""Data collator that supports VLMs.
+
+    Features should contain input_ids, attention_mask, labels, and optionally contain images, videos and audios.
+    The media entries are removed from every feature, turned into model inputs by the template's
+    multimodal plugin and merged into the batch produced by the parent collator.
+    """
+
+    # Chat template whose `mm_plugin` processes the media inputs; mandatory, see `__post_init__`.
+    template: Optional["Template"] = None
+    # Processor object forwarded to the multimodal plugin calls.
+    processor: Optional["ProcessorMixin"] = None
+
+    def __post_init__(self):
+        r"""Validate the template and look up the model's `get_rope_index` function, if it has one.
+
+        Raises:
+            ValueError: If no template was provided.
+        """
+        if self.template is None:
+            raise ValueError("Template is required for MultiModalDataCollator.")
+
+        # Unwrap PEFT models so that the attribute lookups below see the underlying model.
+        if isinstance(self.model, PeftModel):
+            self.model = self.model.base_model.model
+
+        if self.model is not None and hasattr(self.model, "get_rope_index"):  # for qwen2vl mrope
+            self.get_rope_func = self.model.get_rope_index  # transformers < 4.52.0 or qwen2.5 omni
+        elif self.model is not None and hasattr(self.model, "model") and hasattr(self.model.model, "get_rope_index"):
+            self.get_rope_func = self.model.model.get_rope_index  # transformers >= 4.52.0
+        else:
+            self.get_rope_func = None
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+        r"""Collate features into a padded batch including the multimodal model inputs.
+
+        The "images", "videos" and "audios" entries are popped from the given feature dicts, and the
+        first feature may be extended in place with fake media tokens (see below).
+
+        Raises:
+            ValueError: If the model type requires 3D position ids for mrope but none were produced.
+        """
+        batch_images, batch_videos, batch_audios = [], [], []
+        batch_imglens, batch_vidlens, batch_audlens, batch_input_ids = [], [], [], []
+        # Flatten the media of all examples and remember how many items each example contributed.
+        for feature in features:
+            images = feature.pop("images", None) or []
+            videos = feature.pop("videos", None) or []
+            audios = feature.pop("audios", None) or []
+            batch_images.extend(images)
+            batch_videos.extend(videos)
+            batch_audios.extend(audios)
+            batch_imglens.append(len(images))
+            batch_vidlens.append(len(videos))
+            batch_audlens.append(len(audios))
+            batch_input_ids.append(feature["input_ids"])
+
+        # Token ids of fake media that are attached to the first example when the batch has no media.
+        fake_input_ids = []
+        if (
+            self.template.mm_plugin.image_token is not None and sum(batch_imglens) == 0 and sum(batch_vidlens) == 0
+        ):  # avoid process hanging in zero3/fsdp case
+            fake_messages = [{"role": "user", "content": IMAGE_PLACEHOLDER}]
+            # NOTE(review): `Image` is only imported when pillow is available - confirm pillow is guaranteed here.
+            fake_images = [Image.new("RGB", (64, 64), (255, 255, 255))]
+            fake_messages = self.template.mm_plugin.process_messages(
+                fake_messages, fake_images, [], [], self.processor
+            )
+            _fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False)
+            _fake_input_ids, _ = self.template.mm_plugin.process_token_ids(
+                _fake_input_ids, None, fake_images, [], [], self.tokenizer, self.processor
+            )
+            fake_input_ids.extend(_fake_input_ids)
+            batch_images = fake_images
+            # NOTE(review): indexing [0] assumes a non-empty batch - confirm callers never pass [].
+            batch_imglens[0] = 1
+
+        if (
+            self.template.mm_plugin.audio_token is not None and sum(batch_audlens) == 0
+        ):  # avoid process hanging in zero3/fsdp case
+            fake_messages = [{"role": "user", "content": AUDIO_PLACEHOLDER}]
+            fake_audios = [np.zeros(1600)]
+            fake_messages = self.template.mm_plugin.process_messages(
+                fake_messages, [], [], fake_audios, self.processor
+            )
+            _fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False)
+            _fake_input_ids, _ = self.template.mm_plugin.process_token_ids(
+                _fake_input_ids, None, [], [], fake_audios, self.tokenizer, self.processor
+            )
+            fake_input_ids.extend(_fake_input_ids)
+            batch_audios = fake_audios
+            batch_audlens[0] = 1
+
+        # The fake tokens go on the padding side of the first example with attention mask 0 and
+        # IGNORE_INDEX labels.
+        if len(fake_input_ids) != 0:
+            if self.tokenizer.padding_side == "right":
+                features[0]["input_ids"] = features[0]["input_ids"] + fake_input_ids
+                features[0]["attention_mask"] = features[0]["attention_mask"] + [0] * len(fake_input_ids)
+                features[0]["labels"] = features[0]["labels"] + [IGNORE_INDEX] * len(fake_input_ids)
+            else:
+                features[0]["input_ids"] = fake_input_ids + features[0]["input_ids"]
+                features[0]["attention_mask"] = [0] * len(fake_input_ids) + features[0]["attention_mask"]
+                features[0]["labels"] = [IGNORE_INDEX] * len(fake_input_ids) + features[0]["labels"]
+
+            batch_input_ids[0] = features[0]["input_ids"]
+
+        mm_inputs = self.template.mm_plugin.get_mm_inputs(
+            batch_images,
+            batch_videos,
+            batch_audios,
+            batch_imglens,
+            batch_vidlens,
+            batch_audlens,
+            batch_input_ids,
+            self.processor,
+        )
+        # token_type_ids are moved into the per-example features so that the parent collator handles them.
+        if "token_type_ids" in mm_inputs:
+            token_type_ids = mm_inputs.pop("token_type_ids")
+            for i, feature in enumerate(features):
+                feature["token_type_ids"] = token_type_ids[i]
+
+        # From here on `features` is the collated batch returned by the parent collator.
+        features: dict[str, torch.Tensor] = super().__call__(features)
+
+        if self.get_rope_func is not None:
+            rope_index_kwargs = {
+                "input_ids": features["input_ids"],
+                "image_grid_thw": mm_inputs.get("image_grid_thw"),
+                "video_grid_thw": mm_inputs.get("video_grid_thw"),
+                "attention_mask": (features["attention_mask"] >= 1).float(),
+            }
+            if "second_per_grid_ts" in mm_inputs:  # for qwen2vl
+                rope_index_kwargs["second_per_grid_ts"] = mm_inputs.get("second_per_grid_ts")
+            elif "video_second_per_grid" in mm_inputs:  # for qwen2.5 omni
+                rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")
+
+            if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]:
+                rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
+                feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
+                if feature_attention_mask is not None:  # FIXME: need to get video image lengths
+                    audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+                    rope_index_kwargs["audio_seqlens"] = audio_feature_lengths  # prepare for input
+
+                features["position_ids"], rope_deltas = self.get_rope_func(**rope_index_kwargs)
+                # The number of masked-out positions per example is subtracted from the returned deltas.
+                features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum(
+                    dim=-1
+                ).unsqueeze(-1)
+            else:  # for qwen vl
+                features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs)
+
+        # Fail early when a model type from the list below did not end up with 3D position ids.
+        if (
+            self.model is not None
+            and getattr(self.model.config, "model_type", None)
+            in [
+                "glm4v",
+                "Keye",
+                "qwen2_vl",
+                "qwen2_5_vl",
+                "qwen2_5_omni_thinker",
+                "qwen3_omni_moe_thinker",
+                "qwen3_vl",
+                "qwen3_vl_moe",
+            ]
+            and ("position_ids" not in features or features["position_ids"].dim() != 3)
+        ):
+            raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
+
+        if "cross_attention_mask" in mm_inputs:  # for mllama inputs when pad_to_multiple_of is enabled
+            cross_attention_mask = mm_inputs.pop("cross_attention_mask")
+            seq_len = features["input_ids"].size(1)
+            orig_len = cross_attention_mask.size(1)
+            # Pads the end of the third-to-last dimension up to seq_len
+            # (presumably dim 1 of a 4D mask - TODO confirm the mask rank).
+            mm_inputs["cross_attention_mask"] = F.pad(cross_attention_mask, (0, 0, 0, 0, 0, seq_len - orig_len))
+
+        features.update(mm_inputs)
+
+        if "image_bound" in features:  # for minicpmv inputs
+            bsz, seq_length = features["input_ids"].shape
+            features["position_ids"] = torch.arange(seq_length).long().repeat(bsz, 1)
+            # For these inputs the whole batch is nested under "data" next to input_ids and labels.
+            return {"data": features, "input_ids": features["input_ids"], "labels": features["labels"]}
+
+        return features
+
+
+@dataclass
+class SFTDataCollatorWith4DAttentionMask(MultiModalDataCollatorForSeq2Seq):
+    r"""Data collator for 4d attention mask."""
+
+    block_diag_attn: bool = False
+    attn_implementation: Literal["eager", "sdpa", "flash_attention_2"] = "eager"
+    compute_dtype: "torch.dtype" = torch.float32
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+        features = super().__call__(features)
+        if self.block_diag_attn and self.attn_implementation != "flash_attention_2":
+            features["attention_mask"] = prepare_4d_attention_mask(features["attention_mask"], self.compute_dtype)
+
+        for key, value in features.items():  # cast data dtype for paligemma
+            if torch.is_tensor(value) and torch.is_floating_point(value):
+                features[key] = value.to(self.compute_dtype)
+
+        return features
+
+
+@dataclass
+class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
+    r"""Data collator for pairwise data."""
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+        r"""Pad batched data to the longest sequence in the batch.
+
+        We generate 2 * n examples where the first n examples represent chosen examples and
+        the last n examples represent rejected examples.
+        """
+        concatenated_features = []
+        for key in ("chosen", "rejected"):
+            for feature in features:
+                target_feature = {
+                    "input_ids": feature[f"{key}_input_ids"],
+                    "attention_mask": feature[f"{key}_attention_mask"],
+                    "labels": feature[f"{key}_labels"],
+                    "images": feature["images"],
+                    "videos": feature["videos"],
+                    "audios": feature["audios"],
+                }
+                concatenated_features.append(target_feature)
+
+        return super().__call__(concatenated_features)
+
+
+@dataclass
+class KTODataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
+    r"""Data collator for KTO data."""
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+        target_features = []
+        kl_features = []
+        kto_tags = []
+        for feature in features:
+            target_feature = {
+                "input_ids": feature["input_ids"],
+                "attention_mask": feature["attention_mask"],
+                "labels": feature["labels"],
+                "images": feature["images"],
+                "videos": feature["videos"],
+                "audios": feature["audios"],
+            }
+            kl_feature = {
+                "input_ids": feature["kl_input_ids"],
+                "attention_mask": feature["kl_attention_mask"],
+                "labels": feature["kl_labels"],
+                "images": feature["images"],
+                "videos": feature["videos"],
+                "audios": feature["audios"],
+            }
+            target_features.append(target_feature)
+            kl_features.append(kl_feature)
+            kto_tags.append(feature["kto_tags"])
+
+        batch = super().__call__(target_features)
+        kl_batch = super().__call__(kl_features)
+        batch["kl_input_ids"] = kl_batch["input_ids"]
+        batch["kl_attention_mask"] = kl_batch["attention_mask"]
+        batch["kl_labels"] = kl_batch["labels"]
+        if "cross_attention_mask" in kl_batch:  # for mllama inputs
+            batch["kl_cross_attention_mask"] = kl_batch["cross_attention_mask"]
+
+        if "token_type_ids" in kl_batch:
+            batch["kl_token_type_ids"] = kl_batch["token_type_ids"]
+
+        batch["kto_tags"] = torch.tensor(kto_tags)
+        return batch
diff --git a/LlamaFactory/src/llamafactory/data/converter.py b/LlamaFactory/src/llamafactory/data/converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec6f12becf8318404905cdb8070890214619bf7
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/converter.py
@@ -0,0 +1,425 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Union
+
+from ..extras import logging
+from .data_utils import Role
+
+
+if TYPE_CHECKING:
+    from datasets import Dataset, IterableDataset
+    from transformers import Seq2SeqTrainingArguments
+
+    from ..hparams import DataArguments
+    from .mm_plugin import AudioInput, ImageInput, VideoInput
+    from .parser import DatasetAttr
+
+    MediaType = Union[ImageInput, VideoInput, AudioInput]
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class DatasetConverter:
+    r"""Base class for converters that map one raw dataset record to the aligned format.
+
+    Subclasses implement ``__call__``. NOTE: this class does not derive from ``abc.ABC``, so
+    ``@abstractmethod`` documents intent but does not prevent direct instantiation.
+    """
+
+    # Dataset description; this class reads ``load_from``, subclasses read column and tag names.
+    dataset_attr: "DatasetAttr"
+    # Data arguments; ``media_dir`` is the base directory tried for relative media paths.
+    data_args: "DataArguments"
+
+    def _resolve_media_path(self, media: Any) -> Any:
+        r"""Return ``media`` joined onto ``media_dir`` if that file exists, otherwise ``media`` itself."""
+        media_path = os.path.join(self.data_args.media_dir, media)
+        if os.path.isfile(media_path):
+            return media_path
+
+        logger.warning_rank0_once(f"Media {media} does not exist in `media_dir`. Use original path.")
+        return media
+
+    def _find_medias(self, medias: Union["MediaType", list["MediaType"], None]) -> list["MediaType"] | None:
+        r"""Optionally concatenate media path to media dir when loading from local disk.
+
+        Args:
+            medias: A single media item, a list of items, a list of lists (processed video
+                frames), or ``None``.
+
+        Returns:
+            ``None`` when ``medias`` is ``None`` or an empty list; otherwise a new list. The
+            input, including any nested frame lists, is never modified in place.
+        """
+        if medias is None:
+            return None
+
+        if not isinstance(medias, list):
+            medias = [medias]
+        elif len(medias) == 0:
+            return None
+
+        # Paths are only rewritten for locally loaded datasets.
+        if self.dataset_attr.load_from not in ["script", "file"]:
+            return list(medias)
+
+        # The first element decides how the whole list is interpreted.
+        if isinstance(medias[0], str):
+            return [self._resolve_media_path(media) for media in medias]
+
+        if isinstance(medias[0], list):  # for processed video frames
+            # medias is a list of lists, e.g., [[frame1.jpg, frame2.jpg], [frame3.jpg, frame4.jpg]]
+            # Fresh inner lists are built so the caller's nested lists are left untouched.
+            return [[self._resolve_media_path(frame) for frame in frames] for frames in medias]
+
+        return list(medias)
+
+    @abstractmethod
+    def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+        r"""Convert a single example in the dataset to the standard format."""
+        ...
+
+
+@dataclass
+class AlpacaDatasetConverter(DatasetConverter):
+    r"""Converter for Alpaca-style records: flat prompt/query/response columns plus optional history."""
+
+    def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+        r"""Build the aligned ``_prompt`` / ``_response`` / media fields from one record."""
+        attr = self.dataset_attr
+        user_role, assistant_role = Role.USER.value, Role.ASSISTANT.value
+
+        # Earlier turns: each history entry unpacks into a (user, assistant) pair.
+        prompt = []
+        if attr.history and isinstance(example[attr.history], list):
+            for past_query, past_answer in example[attr.history]:
+                prompt += [
+                    {"role": user_role, "content": past_query},
+                    {"role": assistant_role, "content": past_answer},
+                ]
+
+        # Current turn: the non-empty prompt and query columns joined as "prompt\nquery".
+        query_parts = [example[column] for column in (attr.prompt, attr.query) if column and example[column]]
+        prompt.append({"role": user_role, "content": "\n".join(query_parts)})
+
+        if attr.kto_tag and isinstance(example[attr.kto_tag], bool):  # kto example
+            answer = {"role": assistant_role, "content": example[attr.response]}
+            blank = {"role": assistant_role, "content": ""}
+            # The real answer comes first when the tag is True, second otherwise.
+            response = [answer, blank] if example[attr.kto_tag] else [blank, answer]
+        elif (
+            attr.ranking and isinstance(example[attr.chosen], str) and isinstance(example[attr.rejected], str)
+        ):  # pairwise example
+            response = [
+                {"role": assistant_role, "content": example[column]} for column in (attr.chosen, attr.rejected)
+            ]
+        elif attr.response and isinstance(example[attr.response], str):  # normal example
+            response = [{"role": assistant_role, "content": example[attr.response]}]
+        else:  # unsupervised
+            response = []
+
+        output = {
+            "_prompt": prompt,
+            "_response": response,
+            "_system": example[attr.system] if attr.system else "",
+            "_tools": example[attr.tools] if attr.tools else "",
+        }
+        # Media columns are optional; a missing column name yields None.
+        for key, column in (("_images", attr.images), ("_videos", attr.videos), ("_audios", attr.audios)):
+            output[key] = self._find_medias(example[column]) if column else None
+
+        return output
+
+
+@dataclass
+class SharegptDatasetConverter(DatasetConverter):
+    r"""Converter for ShareGPT-style records: a list of role/content messages with configurable tags."""
+
+    def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+        r"""Convert one message list into the aligned format.
+
+        Messages must alternate between user/observation tags (even indices) and
+        assistant/function tags (odd indices). An example that breaks the alternation, has
+        the wrong number of messages, or (for pairwise data) has a chosen/rejected reply
+        whose role is not an assistant/function tag is logged and emitted with empty
+        ``_prompt`` and ``_response``.
+        """
+        tag_mapping = {
+            self.dataset_attr.user_tag: Role.USER.value,
+            self.dataset_attr.assistant_tag: Role.ASSISTANT.value,
+            self.dataset_attr.observation_tag: Role.OBSERVATION.value,
+            self.dataset_attr.function_tag: Role.FUNCTION.value,
+            self.dataset_attr.system_tag: Role.SYSTEM.value,
+        }
+        odd_tags = (self.dataset_attr.user_tag, self.dataset_attr.observation_tag)
+        even_tags = (self.dataset_attr.assistant_tag, self.dataset_attr.function_tag)
+        accept_tags = (odd_tags, even_tags)
+        messages = example[self.dataset_attr.messages]
+        # A leading system message takes precedence over the dedicated system column.
+        if (
+            self.dataset_attr.system_tag
+            and len(messages) != 0
+            and messages[0][self.dataset_attr.role_tag] == self.dataset_attr.system_tag
+        ):
+            system = messages[0][self.dataset_attr.content_tag]
+            messages = messages[1:]
+        else:
+            system = example[self.dataset_attr.system] if self.dataset_attr.system else ""
+
+        aligned_messages = []
+        broken_data = False
+        for turn_idx, message in enumerate(messages):
+            if message[self.dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
+                logger.warning_rank0(f"Invalid role tag in {messages}.")
+                broken_data = True
+                break
+
+            aligned_messages.append(
+                {
+                    "role": tag_mapping[message[self.dataset_attr.role_tag]],
+                    "content": message[self.dataset_attr.content_tag],
+                }
+            )
+
+        # Non-ranking data ends with the response (even count); ranking data holds only the prompt (odd count).
+        if (not self.dataset_attr.ranking and len(aligned_messages) % 2 != 0) or (
+            self.dataset_attr.ranking and len(aligned_messages) % 2 == 0
+        ):
+            logger.warning_rank0(f"Invalid message count in {messages}.")
+            broken_data = True
+
+        if broken_data:
+            logger.warning_rank0("Skipping this abnormal example.")
+            prompt, response = [], []
+        elif self.dataset_attr.kto_tag and isinstance(example[self.dataset_attr.kto_tag], bool):  # kto example
+            prompt = aligned_messages[:-1]
+            response = aligned_messages[-1:]
+            # The real answer comes first when the tag is True, second otherwise.
+            if example[self.dataset_attr.kto_tag]:
+                response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
+            else:
+                response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
+        elif (
+            self.dataset_attr.ranking
+            and isinstance(example[self.dataset_attr.chosen], dict)
+            and isinstance(example[self.dataset_attr.rejected], dict)
+        ):  # pairwise example
+            chosen = example[self.dataset_attr.chosen]
+            rejected = example[self.dataset_attr.rejected]
+            if (
+                chosen[self.dataset_attr.role_tag] not in accept_tags[-1]
+                or rejected[self.dataset_attr.role_tag] not in accept_tags[-1]
+            ):
+                # The skip has to happen here: the generic broken-data branch above was already passed.
+                logger.warning_rank0(f"Invalid role tag in {[chosen, rejected]}.")
+                logger.warning_rank0("Skipping this abnormal example.")
+                prompt, response = [], []
+            else:
+                prompt = aligned_messages
+                response = [
+                    {
+                        "role": tag_mapping[chosen[self.dataset_attr.role_tag]],
+                        "content": chosen[self.dataset_attr.content_tag],
+                    },
+                    {
+                        "role": tag_mapping[rejected[self.dataset_attr.role_tag]],
+                        "content": rejected[self.dataset_attr.content_tag],
+                    },
+                ]
+        else:  # normal example
+            prompt = aligned_messages[:-1]
+            response = aligned_messages[-1:]
+
+        output = {
+            "_prompt": prompt,
+            "_response": response,
+            "_system": system,
+            "_tools": example[self.dataset_attr.tools] if self.dataset_attr.tools else "",
+            "_images": self._find_medias(example[self.dataset_attr.images]) if self.dataset_attr.images else None,
+            "_videos": self._find_medias(example[self.dataset_attr.videos]) if self.dataset_attr.videos else None,
+            "_audios": self._find_medias(example[self.dataset_attr.audios]) if self.dataset_attr.audios else None,
+        }
+        return output
+
+
+@dataclass
+class OpenAIDatasetConverter(DatasetConverter):
+    r"""Converter for OpenAI-style chat records, including ``tool_calls`` and tool-response messages."""
+
+    @staticmethod
+    def _merge_tool_responses(tool_responses: list[str]) -> dict[str, str]:
+        r"""Fold consecutive tool responses into a single observation message."""
+        return {
+            "role": Role.OBSERVATION.value,
+            "content": "\n</tool_response>\n<tool_response>\n".join(tool_responses),
+        }
+
+    def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+        r"""Convert one OpenAI-style record into the aligned format.
+
+        Assistant messages with a non-empty ``tool_calls`` list become function messages whose
+        content is the JSON dump of the called functions. Consecutive tool responses are
+        merged into one observation message. After merging, roles must alternate between
+        user/observation and assistant/function. Examples that violate this, have the wrong
+        message count, or (for pairwise data) have a chosen/rejected reply that does not map
+        to an assistant/function role are logged and emitted with empty ``_prompt`` and
+        ``_response``.
+
+        The system prompt is post-processed: without tools, ``detailed thinking off`` is used
+        or appended unless the prompt already mentions ``detailed thinking on``/``off``; with
+        tools, a newline is appended to a non-empty system prompt.
+        """
+        tag_mapping = {
+            self.dataset_attr.user_tag: Role.USER.value,
+            self.dataset_attr.assistant_tag: Role.ASSISTANT.value,
+            self.dataset_attr.observation_tag: Role.OBSERVATION.value,
+            self.dataset_attr.function_tag: Role.FUNCTION.value,
+            self.dataset_attr.system_tag: Role.SYSTEM.value,
+        }
+
+        messages = example[self.dataset_attr.messages]
+        # A leading system message takes precedence over the dedicated system column.
+        if (
+            self.dataset_attr.system_tag
+            and len(messages) != 0
+            and messages[0][self.dataset_attr.role_tag] == self.dataset_attr.system_tag
+        ):
+            system = messages[0][self.dataset_attr.content_tag]
+            messages = messages[1:]
+        else:
+            system = example.get(self.dataset_attr.system, "") if self.dataset_attr.system else ""
+
+        aligned_messages = []
+        tool_responses = []
+        broken_data = False
+        for message in messages:
+            role = message[self.dataset_attr.role_tag]
+            content = message[self.dataset_attr.content_tag]
+
+            if role in [self.dataset_attr.assistant_tag, self.dataset_attr.function_tag]:
+                # ``get`` also covers records where the field exists but is None or empty.
+                if message.get("tool_calls"):
+                    tool_calls_list = [tool["function"] for tool in message["tool_calls"]]
+                    content = json.dumps(tool_calls_list, ensure_ascii=False)
+                    role = self.dataset_attr.function_tag
+
+            if role == self.dataset_attr.observation_tag:
+                # Buffer tool responses; they are emitted as one message once the run ends.
+                tool_responses.append(content)
+                continue
+            elif len(tool_responses) > 0:
+                aligned_messages.append(self._merge_tool_responses(tool_responses))
+                tool_responses = []
+
+            aligned_messages.append(
+                {
+                    "role": tag_mapping[role],
+                    "content": content,
+                }
+            )
+
+        if len(tool_responses) > 0:
+            if self.dataset_attr.ranking:
+                # A ranking prompt may legitimately end with tool output; without this flush
+                # the buffered responses were lost and the prompt failed the count check below.
+                aligned_messages.append(self._merge_tool_responses(tool_responses))
+            else:
+                # Kept as before for non-ranking data (the trailing responses are not emitted),
+                # but no longer silent.
+                logger.warning_rank0_once("Dropping tool responses that are not followed by another message.")
+
+        odd_tags = (Role.USER.value, Role.OBSERVATION.value)
+        even_tags = (Role.ASSISTANT.value, Role.FUNCTION.value)
+        accept_tags = (odd_tags, even_tags)
+        for turn_idx, message in enumerate(aligned_messages):
+            if message["role"] not in accept_tags[turn_idx % 2]:
+                logger.warning_rank0(f"Invalid role tag in {messages}.")
+                broken_data = True
+                break
+
+        # Non-ranking data ends with the response (even count); ranking data holds only the prompt (odd count).
+        if (not self.dataset_attr.ranking and len(aligned_messages) % 2 != 0) or (
+            self.dataset_attr.ranking and len(aligned_messages) % 2 == 0
+        ):
+            logger.warning_rank0(f"Invalid message count in {messages}.")
+            broken_data = True
+
+        if broken_data:
+            logger.warning_rank0("Skipping this abnormal example.")
+            prompt, response = [], []
+        elif self.dataset_attr.kto_tag and isinstance(example[self.dataset_attr.kto_tag], bool):  # kto example
+            prompt = aligned_messages[:-1]
+            response = aligned_messages[-1:]
+            # The real answer comes first when the tag is True, second otherwise.
+            if example[self.dataset_attr.kto_tag]:
+                response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
+            else:
+                response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
+        elif (
+            self.dataset_attr.ranking
+            and isinstance(example[self.dataset_attr.chosen], dict)
+            and isinstance(example[self.dataset_attr.rejected], dict)
+        ):  # pairwise example
+            chosen = example[self.dataset_attr.chosen]
+            rejected = example[self.dataset_attr.rejected]
+            # Validate the mapped roles: ``even_tags`` holds Role values, not raw dataset tags.
+            reply_roles = [tag_mapping.get(reply[self.dataset_attr.role_tag]) for reply in (chosen, rejected)]
+            if any(reply_role not in even_tags for reply_role in reply_roles):
+                # The skip has to happen here: the generic broken-data branch above was already passed.
+                logger.warning_rank0(f"Invalid role tag in {[chosen, rejected]}.")
+                logger.warning_rank0("Skipping this abnormal example.")
+                prompt, response = [], []
+            else:
+                prompt = aligned_messages
+                response = [
+                    {"role": reply_role, "content": reply[self.dataset_attr.content_tag]}
+                    for reply_role, reply in zip(reply_roles, (chosen, rejected))
+                ]
+        else:  # normal example
+            prompt = aligned_messages[:-1]
+            response = aligned_messages[-1:]
+
+        tools = example.get(self.dataset_attr.tools, "") if self.dataset_attr.tools else ""
+        if isinstance(tools, (dict, list)):
+            tools = json.dumps(tools, ensure_ascii=False)
+
+        short_system_prompt = "detailed thinking off"
+        if tools:
+            if system:
+                system += "\n"
+        elif not system:
+            system = short_system_prompt
+        elif "detailed thinking on" not in system and "detailed thinking off" not in system:
+            system += "\n" + short_system_prompt
+
+        output = {
+            "_prompt": prompt,
+            "_response": response,
+            "_system": system,
+            "_tools": tools,
+            "_images": self._find_medias(example[self.dataset_attr.images]) if self.dataset_attr.images else None,
+            "_videos": self._find_medias(example[self.dataset_attr.videos]) if self.dataset_attr.videos else None,
+            "_audios": self._find_medias(example[self.dataset_attr.audios]) if self.dataset_attr.audios else None,
+        }
+        return output
+
+
+# Registry mapping a formatting name to its converter class. It is extended by
+# register_dataset_converter and looked up by get_dataset_converter.
+DATASET_CONVERTERS = {
+    "alpaca": AlpacaDatasetConverter,
+    "sharegpt": SharegptDatasetConverter,
+    "openai": OpenAIDatasetConverter,
+}
+
+
+def register_dataset_converter(name: str, dataset_converter: type["DatasetConverter"]) -> None:
+    r"""Add ``dataset_converter`` to the registry under ``name``.
+
+    Raises:
+        ValueError: If ``name`` is already registered; existing entries are never replaced.
+    """
+    if name not in DATASET_CONVERTERS:
+        DATASET_CONVERTERS[name] = dataset_converter
+        return
+
+    raise ValueError(f"Dataset converter {name} already exists.")
+
+
+def get_dataset_converter(name: str, dataset_attr: "DatasetAttr", data_args: "DataArguments") -> "DatasetConverter":
+    r"""Instantiate the converter registered under ``name``.
+
+    Raises:
+        ValueError: If no converter is registered under ``name``.
+    """
+    if name in DATASET_CONVERTERS:
+        return DATASET_CONVERTERS[name](dataset_attr, data_args)
+
+    raise ValueError(f"Dataset converter {name} not found.")
+
+
+def align_dataset(
+    dataset: Union["Dataset", "IterableDataset"],
+    dataset_attr: "DatasetAttr",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+) -> Union["Dataset", "IterableDataset"]:
+    r"""Map every example of ``dataset`` through the converter selected by ``dataset_attr.formatting``.
+
+    Aligned dataset:
+    _prompt: [{"role": "user", "content": "..."}] * (2T - 1)
+    _response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset)
+    _system: "..."
+    _tools: "..."
+    _images: []
+    _videos: []
+    _audios: []
+    """
+    # Peek at the first example to learn which raw columns must be dropped after conversion.
+    first_example = next(iter(dataset))
+    column_names = list(first_example.keys())
+
+    # These map options are only passed when not streaming.
+    map_kwargs = {}
+    if not data_args.streaming:
+        reuse_cache = (not data_args.overwrite_cache) or (training_args.local_process_index != 0)
+        map_kwargs["num_proc"] = data_args.preprocessing_num_workers
+        map_kwargs["load_from_cache_file"] = reuse_cache
+        map_kwargs["desc"] = "Converting format of dataset"
+
+    converter = get_dataset_converter(dataset_attr.formatting, dataset_attr, data_args)
+    return dataset.map(converter, batched=False, remove_columns=column_names, **map_kwargs)
diff --git a/LlamaFactory/src/llamafactory/data/data_utils.py b/LlamaFactory/src/llamafactory/data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fcd9554f9b6071ef12726f026ed2a1d8b32507c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/data_utils.py
@@ -0,0 +1,203 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from enum import Enum, unique
+from typing import TYPE_CHECKING, Any, Optional, TypedDict, Union
+
+import fsspec
+from datasets import DatasetDict, concatenate_datasets, interleave_datasets
+
+from ..extras import logging
+
+
+if TYPE_CHECKING:
+    from datasets import Dataset, IterableDataset
+
+    from ..hparams import DataArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+# Type alias for a formatter's slot list: each slot is a string, a set of strings or a str-to-str dict.
+SLOTS = list[Union[str, set[str], dict[str, str]]]
+
+
+@unique
+class Role(str, Enum):
+    # Message roles. The ``str`` mixin makes every member a string as well; ``@unique``
+    # rejects duplicate values at class-creation time.
+    USER = "user"
+    ASSISTANT = "assistant"
+    SYSTEM = "system"
+    FUNCTION = "function"
+    OBSERVATION = "observation"
+
+
+class DatasetModule(TypedDict):
+    # Container for the datasets handed on by get_dataset_module, which only sets the keys
+    # it finds data for.
+    train_dataset: Optional[Union["Dataset", "IterableDataset"]]
+    # Either a single evaluation dataset or a dict of evaluation datasets keyed by name.
+    eval_dataset: Optional[Union["Dataset", "IterableDataset", dict[str, "Dataset"]]]
+
+
+def merge_dataset(
+    all_datasets: list[Union["Dataset", "IterableDataset"]], data_args: "DataArguments", seed: int
+) -> Union["Dataset", "IterableDataset"]:
+    r"""Combine several datasets into one according to ``data_args.mix_strategy``.
+
+    A single dataset is returned as is. ``concat`` concatenates; the ``interleave_*``
+    strategies interleave with ``data_args.interleave_probs`` and ``seed``.
+
+    Raises:
+        ValueError: If the strategy is neither ``concat`` nor starts with ``interleave``.
+        KeyError: If the strategy starts with ``interleave`` but is not a known variant.
+    """
+    if len(all_datasets) == 1:
+        return all_datasets[0]
+
+    strategy = data_args.mix_strategy
+    if strategy == "concat":
+        if data_args.streaming:
+            logger.warning_rank0_once("The samples between different datasets will not be mixed in streaming mode.")
+
+        return concatenate_datasets(all_datasets)
+
+    if strategy.startswith("interleave"):
+        if not data_args.streaming:
+            logger.warning_rank0_once("We recommend using `mix_strategy=concat` in non-streaming mode.")
+
+        # Translate the mix strategy into the stopping strategy name passed to interleave_datasets.
+        stopping_strategies = {
+            "interleave_under": "first_exhausted",
+            "interleave_over": "all_exhausted",
+            "interleave_once": "all_exhausted_without_replacement",
+        }
+        stopping_strategy = stopping_strategies[strategy]
+
+        return interleave_datasets(
+            datasets=all_datasets,
+            probabilities=data_args.interleave_probs,
+            seed=seed,
+            stopping_strategy=stopping_strategy,  # type: ignore
+        )
+
+    raise ValueError(f"Unknown mixing strategy: {data_args.mix_strategy}.")
+
+
+def split_dataset(
+    dataset: Optional[Union["Dataset", "IterableDataset"]],
+    eval_dataset: Optional[Union["Dataset", "IterableDataset", dict[str, "Dataset"]]],
+    data_args: "DataArguments",
+    seed: int,
+) -> tuple[dict, dict]:
+    r"""Split the data into a train dict and a validation dict.
+
+    Works for both map-style and iterable (streaming) datasets.
+
+    Returns:
+        train_dict: Dictionary containing training data with key "train"
+        eval_dict: Dictionary containing evaluation data with keys "validation" or "validation_{name}"
+
+    Raises:
+        ValueError: If ``eval_dataset`` is given together with a positive ``val_size``.
+    """
+    if eval_dataset is not None and data_args.val_size > 1e-6:
+        raise ValueError("Cannot specify `val_size` if `eval_dataset` is not None.")
+
+    # Train and eval splits are returned as two separate dicts so callers can handle each explicitly.
+    train_split, eval_split = {}, {}
+
+    if dataset is not None:
+        source = dataset.shuffle(buffer_size=data_args.buffer_size, seed=seed) if data_args.streaming else dataset
+
+        if not data_args.val_size > 1e-6:
+            train_split["train"] = source
+        elif data_args.streaming:
+            # Streaming: the first ``val_size`` examples become the validation set.
+            eval_split["validation"] = source.take(int(data_args.val_size))
+            train_split["train"] = source.skip(int(data_args.val_size))
+        else:
+            # Values above 1 are passed as an integer, anything else unchanged.
+            holdout = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
+            parts = source.train_test_split(test_size=holdout, seed=seed)
+            train_split["train"] = parts["train"]
+            eval_split["validation"] = parts["test"]
+
+    if eval_dataset is not None:
+        if isinstance(eval_dataset, dict):
+            eval_split.update({f"validation_{name}": data for name, data in eval_dataset.items()})
+        elif data_args.streaming:
+            eval_split["validation"] = eval_dataset.shuffle(buffer_size=data_args.buffer_size, seed=seed)
+        else:
+            eval_split["validation"] = eval_dataset
+
+    return train_split, eval_split
+
+
+def get_dataset_module(dataset: Union["Dataset", "DatasetDict"]) -> "DatasetModule":
+    r"""Turn a dataset or a ``DatasetDict`` into a dataset module.
+
+    A plain dataset becomes the train dataset. For a ``DatasetDict``, ``train`` is the train
+    dataset; ``validation`` is the eval dataset, and if it is absent, all ``validation_{name}``
+    splits are collected into a dict keyed by ``name``.
+    """
+    module: DatasetModule = {}
+
+    if not isinstance(dataset, DatasetDict):  # single dataset
+        module["train_dataset"] = dataset
+        return module
+
+    if "train" in dataset:
+        module["train_dataset"] = dataset["train"]
+
+    if "validation" in dataset:
+        module["eval_dataset"] = dataset["validation"]
+        return module
+
+    prefix = "validation_"
+    named_evals = {key[len(prefix) :]: dataset[key] for key in dataset.keys() if key.startswith(prefix)}
+    if named_evals:
+        module["eval_dataset"] = named_evals
+
+    return module
+
+
+def setup_fs(path: str, anon: bool = False) -> "fsspec.AbstractFileSystem":
+    r"""Create the fsspec filesystem matching the protocol prefix of ``path``.
+
+    Args:
+        path: Cloud path starting with ``s3://``, ``gs://`` or ``gcs://``.
+        anon: When true, ``anon=True`` is passed to the filesystem constructor.
+
+    Raises:
+        ValueError: If the protocol is unsupported or the path does not exist.
+    """
+    if path.startswith("s3://"):
+        protocol = "s3"
+    elif path.startswith(("gs://", "gcs://")):
+        protocol = "gcs"
+    else:
+        raise ValueError(f"Unsupported protocol in path: {path}. Use 's3://' or 'gs://'.")
+
+    # The option is only forwarded when anonymous access was requested.
+    storage_options = {"anon": anon} if anon else {}
+    fs = fsspec.filesystem(protocol, **storage_options)
+
+    if fs.exists(path):
+        return fs
+
+    raise ValueError(f"Path does not exist: {path}.")
+
+
+def _read_json_with_fs(fs: "fsspec.AbstractFileSystem", path: str) -> list[Any]:
+    r"""Load one JSON or JSONL file through ``fs``.
+
+    ``.jsonl`` files are parsed line by line, skipping blank lines; any other file is
+    parsed as a single JSON document.
+    """
+    with fs.open(path, "r") as handle:
+        if not path.endswith(".jsonl"):
+            return json.load(handle)
+
+        records = []
+        for line in handle:
+            if line.strip():
+                records.append(json.loads(line))
+
+        return records
+
+
+def read_cloud_json(cloud_path: str) -> list[Any]:
+    r"""Read a JSON/JSONL file from cloud storage (S3 or GCS).
+
+    If ``cloud_path`` is a directory, every ``.json``/``.jsonl`` entry directly inside it is
+    read and the results are concatenated.
+
+    Args:
+        cloud_path: str
+            Cloud path in the format:
+            - 's3://bucket-name/file.json' for AWS S3
+            - 'gs://bucket-name/file.jsonl' or 'gcs://bucket-name/file.jsonl' for Google Cloud Storage
+
+    Raises:
+        ValueError: If the protocol is unsupported, the path does not exist, or no
+            JSON/JSONL file is found at the path.
+    """
+    try:
+        fs = setup_fs(cloud_path, anon=True)  # try with anonymous access first
+    except Exception:
+        fs = setup_fs(cloud_path)  # try again with credentials
+
+    # ``ls(detail=False)`` yields plain path strings, avoiding the S3-specific "Key" field.
+    candidates = fs.ls(cloud_path, detail=False) if fs.isdir(cloud_path) else [cloud_path]
+
+    # Filter out non-JSON files. This must be a real list: a lazy ``filter`` object is always
+    # truthy, so the emptiness check below would never fire.
+    files = [file for file in candidates if file.endswith((".json", ".jsonl"))]
+    if not files:
+        raise ValueError(f"No JSON/JSONL files found in the specified path: {cloud_path}.")
+
+    return sum([_read_json_with_fs(fs, file) for file in files], [])
diff --git a/LlamaFactory/src/llamafactory/data/formatter.py b/LlamaFactory/src/llamafactory/data/formatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c080f8812d0d9feb320e949d39bfcb1a0e1e582
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/formatter.py
@@ -0,0 +1,159 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+from typing_extensions import override
+
+from .data_utils import SLOTS
+from .tool_utils import FunctionCall, get_tool_utils
+
+
+@dataclass
+class Formatter(ABC):
+    # Abstract base for formatters: subclasses implement ``apply`` and may override ``extract``.
+
+    # Slot templates held by the formatter (see the SLOTS alias); defaults to an empty list.
+    slots: SLOTS = field(default_factory=list)
+    # Passed to get_tool_utils by the tool-aware subclasses in this file; None by default.
+    tool_format: str | None = None
+
+    @abstractmethod
+    def apply(self, **kwargs) -> SLOTS:
+        r"""Forms a list of slots according to the inputs to encode."""
+        ...
+
+    def extract(self, content: str) -> str | list["FunctionCall"]:
+        r"""Extract a list of tuples from the response message if using tools.
+
+        Each tuple consists of function name and function arguments.
+        """
+        # Not supported by the base class; ToolFormatter overrides this.
+        raise NotImplementedError
+
+
+@dataclass
+class EmptyFormatter(Formatter):
+    r"""Formatter whose slots are returned verbatim and therefore must not contain placeholders."""
+
+    def __post_init__(self):
+        # A placeholder looks like ``{{name}}``; only string slots are inspected.
+        placeholder = re.compile(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}")
+        if any(isinstance(slot, str) and placeholder.search(slot) for slot in self.slots):
+            raise ValueError("Empty formatter should not contain any placeholder.")
+
+    @override
+    def apply(self, **kwargs) -> SLOTS:
+        r"""Return the stored slots unchanged; keyword arguments are ignored."""
+        return self.slots
+
+
+@dataclass
+class StringFormatter(Formatter):
+    r"""Formatter that fills ``{{name}}`` placeholders in its string slots."""
+
+    def __post_init__(self):
+        # At least one string slot has to carry a ``{{name}}`` placeholder.
+        placeholder = re.compile(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}")
+        if not any(isinstance(slot, str) and placeholder.search(slot) for slot in self.slots):
+            raise ValueError("A placeholder is required in the string formatter.")
+
+    @override
+    def apply(self, **kwargs) -> SLOTS:
+        r"""Substitute each keyword argument into the string slots.
+
+        Only the first occurrence of every ``{{name}}`` is replaced per slot; dict and set
+        slots are passed through untouched.
+
+        Raises:
+            RuntimeError: If a value is not a string or a slot has an unsupported type.
+        """
+        rendered = []
+        for slot in self.slots:
+            if isinstance(slot, (dict, set)):
+                rendered.append(slot)
+                continue
+
+            if not isinstance(slot, str):
+                raise RuntimeError(f"Input must be string, set[str] or dict[str, str], got {type(slot)}.")
+
+            text = slot
+            for name, value in kwargs.items():
+                if not isinstance(value, str):
+                    raise RuntimeError(f"Expected a string, got {value}")
+
+                text = text.replace("{{" + name + "}}", value, 1)
+
+            rendered.append(text)
+
+        return rendered
+
+
+@dataclass
+class FunctionFormatter(StringFormatter):
+    r"""Formatter that renders JSON-encoded function calls through the configured tool utils."""
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.tool_utils = get_tool_utils(self.tool_format)
+
+    @override
+    def apply(self, **kwargs) -> SLOTS:
+        r"""Format the function calls found in ``content`` and fill them into the slots.
+
+        If ``tool_call_words`` (a pair of delimiters) matches, the JSON between the
+        delimiters is parsed and the rest of the content is kept as a prefix. Otherwise the
+        whole content is parsed, minus a span delimited by ``thought_words`` which, when
+        present, is kept as the prefix.
+
+        Raises:
+            RuntimeError: If the function-call payload is not valid JSON.
+        """
+        content: str = kwargs.pop("content")
+        thought_words = kwargs.pop("thought_words", None)
+        tool_call_words = kwargs.pop("tool_call_words", None)
+
+        def _find_delimited(words):
+            # Non-greedy match between the two delimiters; None unless exactly two are given.
+            if words and len(words) == 2:
+                return re.search(rf"{re.escape(words[0])}(.*?){re.escape(words[1])}", content, re.DOTALL)
+
+            return None
+
+        def _parse_functions(json_content: str) -> list["FunctionCall"]:
+            try:
+                parsed = json.loads(json_content)
+                # A single call object is wrapped so both shapes are handled as a list.
+                calls = parsed if isinstance(parsed, list) else [parsed]
+                return [FunctionCall(call["name"], json.dumps(call["arguments"], ensure_ascii=False)) for call in calls]
+            except json.JSONDecodeError:
+                raise RuntimeError(f"Invalid JSON format in function message: {str([content])}.")
+
+        tool_call_match = _find_delimited(tool_call_words)
+        if tool_call_match is not None:
+            prefix = content.replace(tool_call_match.group(0), "")
+            payload = tool_call_match.group(1)
+        else:
+            thought_match = _find_delimited(thought_words)
+            if thought_match:
+                prefix = thought_match.group(0)
+                payload = content.replace(prefix, "")
+            else:
+                prefix, payload = None, content
+
+        function_str = self.tool_utils.function_formatter(_parse_functions(payload))
+        if prefix is not None:
+            function_str = prefix + function_str
+
+        return super().apply(content=function_str)
+
+
+@dataclass
+class ToolFormatter(Formatter):
+    r"""Formatter that renders tool descriptions and extracts tool calls via the configured tool utils."""
+
+    def __post_init__(self):
+        self.tool_utils = get_tool_utils(self.tool_format)
+
+    @override
+    def apply(self, **kwargs) -> SLOTS:
+        r"""Render the JSON tool list in ``content``; an empty list yields a single empty string.
+
+        Raises:
+            RuntimeError: If ``content`` is not valid JSON.
+        """
+        content = kwargs.pop("content")
+        try:
+            tools = json.loads(content)
+            if len(tools) == 0:
+                return [""]
+
+            return [self.tool_utils.tool_formatter(tools)]
+        except json.JSONDecodeError:
+            raise RuntimeError(f"Invalid JSON format in tool description: {str([content])}.")  # flat string
+
+    @override
+    def extract(self, content: str) -> str | list["FunctionCall"]:
+        r"""Delegate extraction of ``content`` to the tool utils' ``tool_extractor``."""
+        return self.tool_utils.tool_extractor(content)
diff --git a/LlamaFactory/src/llamafactory/data/loader.py b/LlamaFactory/src/llamafactory/data/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3d44e6f3c25de0fa46c333be6a3a39d5f1dcfee
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/loader.py
@@ -0,0 +1,336 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import TYPE_CHECKING, Literal, Optional, Union
+
+import numpy as np
+from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
+
+from ..extras import logging
+from ..extras.constants import FILEEXT2TYPE
+from ..extras.misc import check_version, has_tokenized_data
+from .converter import align_dataset
+from .data_utils import get_dataset_module, merge_dataset, read_cloud_json, split_dataset
+from .parser import get_dataset_list
+from .processor import (
+    FeedbackDatasetProcessor,
+    PackedSupervisedDatasetProcessor,
+    PairwiseDatasetProcessor,
+    PretrainDatasetProcessor,
+    SupervisedDatasetProcessor,
+    UnsupervisedDatasetProcessor,
+)
+
+
+if TYPE_CHECKING:
+    from datasets import Dataset, IterableDataset
+    from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments
+
+    from ..hparams import DataArguments, ModelArguments
+    from .data_utils import DatasetModule
+    from .parser import DatasetAttr
+    from .processor import DatasetProcessor
+    from .template import Template
+
+
+logger = logging.get_logger(__name__)
+
+
+def _load_single_dataset(
+    dataset_attr: "DatasetAttr",
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+) -> Union["Dataset", "IterableDataset"]:
+    r"""Load one dataset from its configured source (hub, script, cloud or local file) and align its format."""
+    logger.info_rank0(f"Loading dataset {dataset_attr}...")
+    data_path, data_name, data_dir, data_files = None, None, None, None
+    if dataset_attr.load_from in ["hf_hub", "ms_hub", "om_hub"]:
+        data_path = dataset_attr.dataset_name
+        data_name = dataset_attr.subset
+        data_dir = dataset_attr.folder
+
+    elif dataset_attr.load_from == "script":
+        data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+        data_name = dataset_attr.subset
+        data_dir = dataset_attr.folder
+
+    elif dataset_attr.load_from == "cloud_file":
+        data_path = dataset_attr.dataset_name
+
+    elif dataset_attr.load_from == "file":
+        local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+        if os.path.isdir(local_path):  # is directory
+            data_files = [os.path.join(local_path, file_name) for file_name in os.listdir(local_path)]
+        elif os.path.isfile(local_path):  # is file
+            data_files = [local_path]
+        else:
+            raise ValueError(f"File {local_path} not found.")
+
+        if len(data_files) == 0:  # empty directory: fail clearly instead of an IndexError on data_files[0]
+            raise ValueError(f"No data files found in {local_path}.")
+
+        data_path = FILEEXT2TYPE.get(os.path.splitext(data_files[0])[-1][1:], None)
+        if data_path is None:
+            raise ValueError("Allowed file types: {}.".format(",".join(FILEEXT2TYPE.keys())))
+
+        if any(data_path != FILEEXT2TYPE.get(os.path.splitext(data_file)[-1][1:], None) for data_file in data_files):
+            raise ValueError("File types should be identical.")
+    else:
+        raise NotImplementedError(f"Unknown load type: {dataset_attr.load_from}.")
+
+    if dataset_attr.load_from == "ms_hub":
+        check_version("modelscope>=1.14.0", mandatory=True)
+        from modelscope import MsDataset  # type: ignore
+        from modelscope.utils.config_ds import MS_DATASETS_CACHE  # type: ignore
+
+        cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
+        dataset = MsDataset.load(
+            dataset_name=data_path,
+            subset_name=data_name,
+            data_dir=data_dir,
+            data_files=data_files,
+            split=dataset_attr.split,
+            cache_dir=cache_dir,
+            token=model_args.ms_hub_token,
+            use_streaming=data_args.streaming,
+        )
+        if isinstance(dataset, MsDataset):
+            dataset = dataset.to_hf_dataset()
+
+    elif dataset_attr.load_from == "om_hub":
+        check_version("openmind>=0.8.0", mandatory=True)
+        from openmind import OmDataset  # type: ignore
+        from openmind.utils.hub import OM_DATASETS_CACHE  # type: ignore
+
+        cache_dir = model_args.cache_dir or OM_DATASETS_CACHE
+        dataset = OmDataset.load_dataset(
+            path=data_path,
+            name=data_name,
+            data_dir=data_dir,
+            data_files=data_files,
+            split=dataset_attr.split,
+            cache_dir=cache_dir,
+            token=model_args.om_hub_token,
+            streaming=data_args.streaming,
+        )
+    elif dataset_attr.load_from == "cloud_file":
+        dataset = Dataset.from_list(read_cloud_json(data_path), split=dataset_attr.split)
+    else:
+        dataset = load_dataset(
+            path=data_path,
+            name=data_name,
+            data_dir=data_dir,
+            data_files=data_files,
+            split=dataset_attr.split,
+            cache_dir=model_args.cache_dir,
+            token=model_args.hf_hub_token,
+            num_proc=data_args.preprocessing_num_workers,
+            streaming=data_args.streaming and dataset_attr.load_from != "file",
+        )
+        if data_args.streaming and dataset_attr.load_from == "file":
+            dataset = dataset.to_iterable_dataset(num_shards=training_args.dataloader_num_workers)
+
+    if dataset_attr.num_samples is not None and not data_args.streaming:
+        target_num = dataset_attr.num_samples
+        indexes = np.random.permutation(len(dataset))[:target_num]  # all samples should be included
+        target_num -= len(indexes)
+        if target_num > 0:  # more requested than available: top up by sampling with replacement
+            indexes = np.concatenate((indexes, np.random.choice(len(dataset), target_num)), axis=0)
+
+        assert len(indexes) == dataset_attr.num_samples, "Sample num mismatched."
+        dataset = dataset.select(indexes)
+        logger.info_rank0(f"Sampled {dataset_attr.num_samples} examples from dataset {dataset_attr}.")
+
+    if data_args.max_samples is not None:  # truncate dataset
+        max_samples = min(data_args.max_samples, len(dataset))
+        dataset = dataset.select(range(max_samples))
+
+    return align_dataset(dataset, dataset_attr, data_args, training_args)
+
+
+def _get_merged_dataset(
+    dataset_names: list[str] | None,
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+    return_dict: bool = False,
+) -> Union["Dataset", "IterableDataset", dict[str, "Dataset"]] | None:
+    r"""Load every named dataset; return them keyed by name if `return_dict`, else merged (None if no names)."""
+    if dataset_names is None:
+        return None
+
+    is_rm_stage = stage == "rm"  # ranking datasets and the "rm" stage must go together
+    loaded = {}
+    for name, attr in zip(dataset_names, get_dataset_list(dataset_names, data_args.dataset_dir)):
+        if (is_rm_stage and attr.ranking is False) or (not is_rm_stage and attr.ranking is True):
+            raise ValueError("The dataset is not applicable in the current training stage.")
+
+        loaded[name] = _load_single_dataset(attr, model_args, data_args, training_args)
+
+    if return_dict:
+        return loaded
+    return merge_dataset(list(loaded.values()), data_args, seed=training_args.seed)
+
+
+def _get_dataset_processor(
+    data_args: "DataArguments",
+    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+    template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
+    do_generate: bool = False,
+) -> "DatasetProcessor":
+    r"""Return the dataset processor matching the training stage."""
+    # "sft" without generation -> supervised processor (packed variant when `data_args.packing` is set)
+    if stage == "sft" and not do_generate:
+        if data_args.packing:
+            processor_cls = PackedSupervisedDatasetProcessor
+            if data_args.neat_packing:  # hack datasets to have int32 attention mask
+                from datasets.arrow_writer import OptimizedTypedSequence, TypedSequence
+
+                def __init__(self, data, **kwargs):
+                    return TypedSequence.__init__(
+                        self,
+                        data,
+                        type=kwargs.pop("type", None),
+                        try_type=kwargs.pop("try_type", None),
+                        optimized_int_type=kwargs.pop("optimized_int_type", None),
+                    )
+
+                OptimizedTypedSequence.__init__ = __init__
+        else:
+            processor_cls = SupervisedDatasetProcessor
+    else:
+        # "pt", "rm" and "kto" have dedicated processors; anything else (e.g. "ppo") is unsupervised
+        fixed_classes = {
+            "pt": PretrainDatasetProcessor,
+            "rm": PairwiseDatasetProcessor,
+            "kto": FeedbackDatasetProcessor,
+        }
+        processor_cls = fixed_classes.get(stage, UnsupervisedDatasetProcessor)
+
+    return processor_cls(template=template, tokenizer=tokenizer, processor=processor, data_args=data_args)
+
+
+def _get_preprocessed_dataset(
+    dataset: Union["Dataset", "IterableDataset"] | None,
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+    template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"] = None,
+    is_eval: bool = False,
+) -> Union["Dataset", "IterableDataset"] | None:
+    r"""Map the stage-specific processor over `dataset` and, on logging ranks, print one processed example."""
+    if dataset is None:
+        return None
+
+    do_generate = training_args.predict_with_generate and is_eval
+    dataset_processor = _get_dataset_processor(
+        data_args, stage, template, tokenizer, processor, do_generate=do_generate
+    )
+    column_names = list(next(iter(dataset)).keys())  # original columns, removed by the map below
+    map_kwargs = {}  # extra `map` options are only passed when not streaming
+    if not data_args.streaming:
+        map_kwargs["num_proc"] = data_args.preprocessing_num_workers
+        map_kwargs["load_from_cache_file"] = (not data_args.overwrite_cache) or (training_args.local_process_index != 0)
+        map_kwargs["desc"] = "Running tokenizer on dataset"
+
+    dataset = dataset.map(
+        dataset_processor.preprocess_dataset,
+        batched=True,
+        batch_size=data_args.preprocessing_batch_size,
+        remove_columns=column_names,
+        **map_kwargs,
+    )
+
+    if training_args.should_log:
+        try:
+            print("eval example:" if is_eval else "training example:")
+            dataset_processor.print_data_example(next(iter(dataset)))
+        except StopIteration:  # nothing left to show after processing
+            if stage == "pt":
+                error_msg = "Cannot find sufficient samples, consider increasing dataset size."
+            else:
+                error_msg = "Cannot find valid samples, check `data/README.md` for the data format."
+            raise RuntimeError(error_msg)
+
+    return dataset
+
+
+def get_dataset(
+    template: "Template",
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"] = None,
+) -> "DatasetModule":
+    r"""Return the dataset module, reusing tokenized data on disk when present and building it otherwise."""
+    # Load tokenized dataset if path exists
+    if data_args.tokenized_path is not None:
+        if has_tokenized_data(data_args.tokenized_path):
+            logger.warning_rank0("Loading dataset from disk will ignore other data arguments.")
+            tokenized_data = load_from_disk(data_args.tokenized_path)
+            dataset_module = get_dataset_module(tokenized_data)
+            if data_args.streaming:
+                dataset_module["train_dataset"] = dataset_module["train_dataset"].to_iterable_dataset()
+
+            logger.info_rank0(f"Loaded tokenized dataset from {data_args.tokenized_path}.")
+            return dataset_module
+
+        if data_args.streaming:  # no tokenized data yet, so it will be built and saved below
+            raise ValueError("Turn off `streaming` when saving dataset to disk.")
+
+    # Load and preprocess dataset
+    with training_args.main_process_first(desc="load dataset", local=(not data_args.data_shared_file_system)):
+        dataset = _get_merged_dataset(data_args.dataset, model_args, data_args, training_args, stage)
+        eval_dataset = _get_merged_dataset(
+            data_args.eval_dataset,
+            model_args,
+            data_args,
+            training_args,
+            stage,
+            return_dict=data_args.eval_on_each_dataset,
+        )
+
+    with training_args.main_process_first(desc="pre-process dataset", local=(not data_args.data_shared_file_system)):
+        # split before preprocessing so the eval part (provided or split off) goes through the eval path below
+        train_dict, eval_dict = split_dataset(dataset, eval_dataset, data_args, seed=training_args.seed)
+
+        if "train" in train_dict:
+            train_dict["train"] = _get_preprocessed_dataset(
+                train_dict["train"], data_args, training_args, stage, template, tokenizer, processor, is_eval=False
+            )
+
+        for key in eval_dict:
+            eval_dict[key] = _get_preprocessed_dataset(
+                eval_dict[key], data_args, training_args, stage, template, tokenizer, processor, is_eval=True
+            )
+
+        # Combine train and eval dictionaries
+        dataset_dict = DatasetDict({**train_dict, **eval_dict})
+
+        if data_args.tokenized_path is not None:  # save tokenized dataset to disk
+            if training_args.should_save:
+                dataset_dict.save_to_disk(data_args.tokenized_path)
+                logger.info_rank0(f"Tokenized dataset is saved at {data_args.tokenized_path}.")
+                logger.info_rank0(f"Please launch the training with `tokenized_path: {data_args.tokenized_path}`.")
+
+        return get_dataset_module(dataset_dict)
diff --git a/LlamaFactory/src/llamafactory/data/mm_plugin.py b/LlamaFactory/src/llamafactory/data/mm_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..fca092cb8747b0ca709419970304859f8383e8d2
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/mm_plugin.py
@@ -0,0 +1,2241 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's Transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/processing_llava.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+import os
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from io import BytesIO
+from typing import TYPE_CHECKING, BinaryIO, Literal, NotRequired, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+import torchaudio
+from transformers.image_utils import get_image_size, is_valid_image, to_numpy_array
+from transformers.models.mllama.processing_mllama import (
+    convert_sparse_cross_attention_mask_to_dense,
+    get_cross_attention_token_mask,
+)
+from typing_extensions import override
+
+from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER
+from ..extras.packages import is_pillow_available, is_pyav_available, is_transformers_version_greater_than
+
+
+if is_pillow_available():
+    from PIL import Image
+    from PIL.Image import Image as ImageObject
+
+
+if is_pyav_available():
+    import av
+
+
+if is_transformers_version_greater_than("4.52.0"):
+    from transformers.image_utils import make_flat_list_of_images
+    from transformers.video_utils import make_batched_videos
+else:
+    from transformers.image_utils import make_batched_videos, make_flat_list_of_images
+
+
+if TYPE_CHECKING:
+    from av.stream import Stream
+    from numpy.typing import NDArray
+    from transformers import PreTrainedTokenizer, ProcessorMixin
+    from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
+    from transformers.image_processing_utils import BaseImageProcessor
+    from transformers.video_processing_utils import BaseVideoProcessor
+
+    class EncodedImage(TypedDict):  # image passed as a dict; `bytes` is used when not None, otherwise `path`
+        path: str | None
+        bytes: bytes | None
+
+    ImageInput = Union[str, bytes, EncodedImage, BinaryIO, ImageObject]
+    VideoInput = Union[str, BinaryIO, list[list[ImageInput]]]
+    AudioInput = Union[str, BinaryIO, NDArray]
+
+    class RegularizedImageOutput(TypedDict):  # return type of `_regularize_images`
+        images: list[ImageObject]
+
+    class RegularizedVideoOutput(TypedDict):  # return type of `_regularize_videos`
+        videos: list[list[ImageObject]]
+        durations: list[float]  # one entry per video
+        fps_per_video: NotRequired[list[float]]
+
+    class RegularizedAudioOutput(TypedDict):  # return type of `_regularize_audios`
+        audios: list[NDArray]
+        sampling_rates: list[float]  # one entry per audio
+
+    class MMProcessor(ProcessorMixin):  # typing-only view of processor attributes used in this module
+        patch_size: int
+        image_seq_length: int
+        num_additional_image_tokens: int
+        vision_feature_select_strategy: Literal["default", "full"]
+
+        def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
+            pass
+
+
+def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
+    r"""Build paligemma token type ids used when computing the loss.
+
+    For each sample, the leading `imglen * processor.image_seq_length` positions are 0 and the rest are 1.
+    This deviates from the original token type ids, where the prompt part is 0.
+
+    Returns:
+        batch_token_type_ids: shape (batch_size, seq_length)
+
+    """
+    def _type_ids(imglen: int, seqlen: int) -> list[int]:
+        image_seqlen = imglen * processor.image_seq_length
+        return [0] * image_seqlen + [1] * (seqlen - image_seqlen)
+
+    return [_type_ids(imglen, seqlen) for imglen, seqlen in zip(imglens, seqlens)]
+
+
+def _get_gemma3_token_type_ids(batch_ids: list[list[int]], processor: "MMProcessor"):
+    r"""Build gemma3 token type ids that flag image tokens, for computing loss.
+
+    Every position whose token id equals `processor.image_token_id` is marked 1; all others are 0.
+
+    Returns:
+        batch_token_type_ids: shape (batch_size, seq_length)
+
+    """
+    image_token_id: int = getattr(processor, "image_token_id")
+
+    def _mark_image_tokens(token_ids: list[int]) -> list[int]:
+        ids = np.array(token_ids)
+        return (ids == image_token_id).astype(ids.dtype).tolist()
+
+    return [_mark_image_tokens(token_ids) for token_ids in batch_ids]
+
+
+def _make_batched_images(images: list["ImageObject"], imglens: list[int]) -> list[list["ImageObject"]]:
+    r"""Split the flat `images` list into consecutive chunks whose sizes are given by `imglens`."""
+    batch_images, start = [], 0
+    for imglen in imglens:
+        batch_images.append(images[start : start + imglen])
+        start += imglen
+
+    return batch_images
+
+
+def _check_video_is_nested_images(video: "VideoInput") -> bool:
+    r"""Return True if `video` is a list whose elements all pass the frame type check (also True when empty)."""
+    return isinstance(video, list) and all(isinstance(frame, (str, BinaryIO, dict, ImageObject)) for frame in video)
+
+
+@dataclass
+class MMPluginMixin:
+    # placeholder token per modality; None means that modality is rejected by `_validate_input`
+    image_token: str | None
+    video_token: str | None
+    audio_token: str | None
+    expand_mm_tokens: bool = True
+
+    def _validate_input(
+        self,
+        processor: Optional["MMProcessor"],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+    ) -> None:
+        r"""Validate if this model accepts the input modalities and the processor provides what they need."""
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+        video_processor: BaseImageProcessor = getattr(processor, "video_processor", image_processor)
+        feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None)
+        if len(images) != 0 and self.image_token is None:
+            raise ValueError(
+                "This model does not support image input. Please check whether the correct `template` is used."
+            )
+
+        if len(videos) != 0 and self.video_token is None:
+            raise ValueError(
+                "This model does not support video input. Please check whether the correct `template` is used."
+            )
+
+        if len(audios) != 0 and self.audio_token is None:
+            raise ValueError(
+                "This model does not support audio input. Please check whether the correct `template` is used."
+            )
+
+        if self.image_token is not None and processor is None:
+            raise ValueError("Processor was not found, please check and update your model file.")
+
+        if self.image_token is not None and image_processor is None:
+            raise ValueError("Image processor was not found, please check and update your model file.")
+
+        if self.video_token is not None and video_processor is None:
+            raise ValueError("Video processor was not found, please check and update your model file.")
+
+        if self.audio_token is not None and feature_extractor is None:
+            raise ValueError("Audio feature extractor was not found, please check and update your model file.")
+
+    def _validate_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+    ):
+        r"""Validate if the number of images, videos and audios match the number of placeholders in messages."""
+        num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
+        for message in messages:
+            num_image_tokens += message["content"].count(IMAGE_PLACEHOLDER)
+            num_video_tokens += message["content"].count(VIDEO_PLACEHOLDER)
+            num_audio_tokens += message["content"].count(AUDIO_PLACEHOLDER)
+
+        if len(images) != num_image_tokens:
+            raise ValueError(
+                f"The number of images does not match the number of {IMAGE_PLACEHOLDER} tokens in {messages}."
+            )
+
+        if len(videos) != num_video_tokens:
+            raise ValueError(
+                f"The number of videos does not match the number of {VIDEO_PLACEHOLDER} tokens in {messages}."
+            )
+
+        if len(audios) != num_audio_tokens:
+            raise ValueError(
+                f"The number of audios does not match the number of {AUDIO_PLACEHOLDER} tokens in {messages}."
+            )
+
+    def _preprocess_image(
+        self, image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
+    ) -> "ImageObject":
+        r"""Pre-process a single image: rescale into the pixel budget (keeping aspect ratio) and convert to RGB."""
+        if (image.width * image.height) > image_max_pixels:
+            resize_factor = math.sqrt(image_max_pixels / (image.width * image.height))
+            width, height = int(image.width * resize_factor), int(image.height * resize_factor)
+            image = image.resize((width, height))
+
+        if (image.width * image.height) < image_min_pixels:
+            resize_factor = math.sqrt(image_min_pixels / (image.width * image.height))
+            width, height = int(image.width * resize_factor), int(image.height * resize_factor)
+            image = image.resize((width, height))
+
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+
+        return image
+
+    def _get_video_sample_indices(
+        self, video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs
+    ) -> list[int]:
+        r"""Compute video sample indices according to fps, capped by the frame count and `video_maxlen`."""
+        total_frames = video_stream.frames
+        if total_frames == 0:  # infinite video
+            return np.linspace(0, video_maxlen - 1, video_maxlen).astype(np.int32)
+
+        fps_frames = total_frames  # without a known duration only the frame-count and maxlen caps apply
+        if video_stream.duration is not None:
+            fps_frames = max(1, math.floor(float(video_stream.duration * video_stream.time_base) * video_fps))
+        return np.linspace(0, total_frames - 1, min(total_frames, video_maxlen, fps_frames)).astype(np.int32)
+
+    def _regularize_images(self, images: list["ImageInput"], **kwargs) -> "RegularizedImageOutput":
+        r"""Regularize images to avoid error. Including reading and pre-processing."""
+        results = []
+        for image in images:
+            if isinstance(image, (str, BinaryIO)):
+                image = Image.open(image)
+            elif isinstance(image, bytes):
+                image = Image.open(BytesIO(image))
+            elif isinstance(image, dict):
+                if image["bytes"] is not None:
+                    image = Image.open(BytesIO(image["bytes"]))
+                else:
+                    image = Image.open(image["path"])
+
+            if not isinstance(image, ImageObject):
+                raise ValueError(f"Expect input is a list of images, but got {type(image)}.")
+
+            results.append(self._preprocess_image(image, **kwargs))
+
+        return {"images": results}
+
+    def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> "RegularizedVideoOutput":
+        r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
+        results = []
+        durations = []
+        for video in videos:
+            frames: list[ImageObject] = []
+            if _check_video_is_nested_images(video):
+                for frame in video:
+                    if not is_valid_image(frame) and not isinstance(frame, dict) and not os.path.exists(frame):
+                        raise ValueError("Invalid image found in video frames.")
+                frames = video
+                durations.append(len(frames) / kwargs.get("video_fps", 2.0))
+            else:
+                with av.open(video, "r") as container:  # closes the container even if decoding fails
+                    video_stream = next(stream for stream in container.streams if stream.type == "video")
+                    sample_indices = set(map(int, self._get_video_sample_indices(video_stream, **kwargs)))
+                    container.seek(0)
+                    for frame_idx, frame in enumerate(container.decode(video_stream)):
+                        if frame_idx in sample_indices:
+                            frames.append(frame.to_image())
+
+                    if video_stream.duration is None:
+                        durations.append(len(frames) / kwargs.get("video_fps", 2.0))
+                    else:
+                        durations.append(float(video_stream.duration * video_stream.time_base))
+
+            frames = self._regularize_images(frames, **kwargs)["images"]
+            results.append(frames)
+
+        return {"videos": results, "durations": durations}
+
+    def _regularize_audios(
+        self, audios: list["AudioInput"], sampling_rate: float, **kwargs
+    ) -> "RegularizedAudioOutput":
+        r"""Regularizes audios to avoid error. Including reading and resampling."""
+        results, sampling_rates = [], []
+        for audio in audios:
+            if not isinstance(audio, np.ndarray):
+                audio, sr = torchaudio.load(audio)
+                if audio.shape[0] > 1:
+                    audio = audio.mean(dim=0, keepdim=True)
+
+                if sr != sampling_rate:
+                    audio = torchaudio.functional.resample(audio, sr, sampling_rate)
+
+                audio = audio.squeeze(0).numpy()
+
+            results.append(audio)
+            sampling_rates.append(sampling_rate)
+
+        return {"audios": results, "sampling_rates": sampling_rates}
+
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "MMProcessor",
+        imglens: list[int] | None = None,
+    ) -> dict[str, "torch.Tensor"]:
+        r"""Process visual inputs.
+
+        Returns: (llava and paligemma)
+            pixel_values: tensor with shape (B, C, H, W)
+
+        Returns: (qwen2-vl)
+            pixel_values: tensor with shape (num_patches, patch_dim)
+            image_grid_thw: tensor with shape (num_images, 3), where the three numbers are time, height, width
+                            where num_patches == torch.prod(image_grid_thw)
+
+        Returns: (mllama)
+            pixel_values: tensor with shape
+                          (batch_size, max_num_images, max_image_tiles, channels, tile_height, tile_width)
+                          For example, (2, 1, 4, 3, 560, 560).
+            aspect_ratio_ids: tensor with shape (batch_size, max_num_images). For example, (2, 1).
+            aspect_ratio_mask: tensor with shape (batch_size, max_num_images, max_image_tiles). For example, (2, 1, 4).
+            num_tiles: List[List[int]] with shape (batch_size, num_images_in_batch). For example, (2, 1).
+
+        """
+        mm_inputs = {}
+        if len(images) != 0:
+            image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+            images = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )["images"]
+            if imglens is not None:  # if imglens are provided, make batched images
+                images = _make_batched_images(images, imglens)
+
+            image_processor_kwargs = {}
+            if getattr(processor, "image_do_pan_and_scan", False):  # gemma3 image processor
+                image_processor_kwargs.update(
+                    {
+                        "do_pan_and_scan": True,
+                        "pan_and_scan_min_crop_size": 256,
+                        "pan_and_scan_max_num_crops": 4,
+                        "pan_and_scan_min_ratio_to_activate": 1.2,
+                    }
+                )
+
+            mm_inputs.update(image_processor(images, return_tensors="pt", **image_processor_kwargs))
+
+        if len(videos) != 0:
+            video_processor: BaseImageProcessor = getattr(
+                processor, "video_processor", getattr(processor, "image_processor", None)
+            )
+            videos = self._regularize_videos(
+                videos,
+                image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+                video_fps=getattr(processor, "video_fps", 2.0),
+                video_maxlen=getattr(processor, "video_maxlen", 128),
+            )["videos"]
+            if "videos" in inspect.signature(video_processor.preprocess).parameters:  # for qwen2_vl and video_llava
+                mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
+            else:  # for llava_next_video
+                mm_inputs.update(video_processor(videos, return_tensors="pt"))
+
+        if len(audios) != 0:
+            feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None)
+            audios = self._regularize_audios(
+                audios,
+                sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+            )["audios"]
+            mm_inputs.update(
+                feature_extractor(
+                    audios,
+                    sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+                    return_attention_mask=True,
+                    padding="max_length",
+                    return_tensors="pt",
+                )
+            )
+            mm_inputs["feature_attention_mask"] = mm_inputs.pop("attention_mask", None)  # prevent conflicts
+
+        return mm_inputs
+
+
+@dataclass
+class BasePlugin(MMPluginMixin):
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Pre-process input messages before tokenization for VLMs."""
+        self._validate_input(processor, images, videos, audios)
+        return messages
+
+    def process_token_ids(
+        self,
+        input_ids: list[int],
+        labels: list[int] | None,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        tokenizer: "PreTrainedTokenizer",
+        processor: Optional["MMProcessor"],
+    ) -> tuple[list[int], list[int] | None]:
+        r"""Pre-process token ids after tokenization for VLMs."""
+        self._validate_input(processor, images, videos, audios)
+        return input_ids, labels
+
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Build batched multimodal inputs for VLMs.
+
+        Arguments:
+            images: a list of image inputs, shape (num_images,)
+            videos: a list of video inputs, shape (num_videos,)
+            audios: a list of audio inputs, shape (num_audios,)
+            imglens: number of images in each sample, shape (batch_size,)
+            vidlens: number of videos in each sample, shape (batch_size,)
+            audlens: number of audios in each sample, shape (batch_size,)
+            batch_ids: token ids of input samples, shape (batch_size, seq_len)
+            processor: a processor for pre-processing images and videos
+
+        """
+        self._validate_input(processor, images, videos, audios)
+        return self._get_mm_inputs(images, videos, audios, processor)
+
+
+@dataclass
+class ErnieVLPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        messages = deepcopy(messages)
+
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+
+        merge_length: int = getattr(image_processor, "merge_size") ** 2
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            image_grid_thw = mm_inputs.get("image_grid_thw", [])
+            video_grid_thw = mm_inputs.get("video_grid_thw", [])
+        else:
+            image_grid_thw = [None] * len(images)
+            video_grid_thw = [None] * len(videos)
+
+        image_idx, video_idx = 0, 0
+        for message in messages:
+            content = message["content"]
+            image_token = self.image_token or "<|IMAGE_PLACEHOLDER|>"
+            video_token = self.video_token or "<|VIDEO_PLACEHOLDER|>"
+            while IMAGE_PLACEHOLDER in content:
+                image_seqlen = image_grid_thw[image_idx].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    IMAGE_PLACEHOLDER,
+                    f"Picture {image_idx + 1}:<|IMAGE_START|>{image_token * image_seqlen}<|IMAGE_END|>",
+                    1,
+                )
+                image_idx += 1
+            while VIDEO_PLACEHOLDER in content:
+                video_seqlen = video_grid_thw[video_idx].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    VIDEO_PLACEHOLDER,
+                    f"Video {video_idx + 1}:<|VIDEO_START|>{video_token * video_seqlen}<|VIDEO_END|>",
+                    1,
+                )
+                video_idx += 1
+            message["content"] = content
+        return messages
+
+
+@dataclass
+class Gemma3Plugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens = 0
+        messages = deepcopy(messages)
+        boi_token: str = getattr(processor, "boi_token")
+        full_image_sequence: str = getattr(processor, "full_image_sequence")
+        image_str = full_image_sequence if self.expand_mm_tokens else boi_token
+
+        do_pan_and_scan: bool = getattr(processor, "image_do_pan_and_scan", False)
+        if do_pan_and_scan:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                if do_pan_and_scan:
+                    image_placeholder_str = (
+                        "Here is the original image {{image}} and here are some crops to help you see better "
+                        + " ".join(["{{image}}"] * mm_inputs["num_crops"][0][num_image_tokens])
+                    )
+                else:
+                    image_placeholder_str = "{{image}}"
+
+                content = content.replace(IMAGE_PLACEHOLDER, image_placeholder_str, 1)
+                num_image_tokens += 1
+
+            message["content"] = content.replace("{{image}}", image_str)
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        self._validate_input(processor, images, videos, audios)
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        mm_inputs.pop("num_crops", None)
+        mm_inputs["token_type_ids"] = _get_gemma3_token_type_ids(batch_ids, processor)
+        return mm_inputs
+
+
+class Gemma3nPlugin(Gemma3Plugin):
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        messages = deepcopy(messages)
+        boi_token: str = getattr(processor, "boi_token")
+        boa_token: str = getattr(processor, "boa_token")
+        full_image_sequence: str = getattr(processor, "full_image_sequence")
+        full_audio_sequence: str = getattr(processor, "full_audio_sequence")
+        image_str = full_image_sequence if self.expand_mm_tokens else boi_token
+        audio_str = full_audio_sequence if self.expand_mm_tokens else boa_token
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                content = content.replace(IMAGE_PLACEHOLDER, image_str, 1)
+
+            while AUDIO_PLACEHOLDER in content:
+                content = content.replace(AUDIO_PLACEHOLDER, audio_str, 1)
+
+            message["content"] = content
+
+        return messages
+
+
+@dataclass
+class InternVLPlugin(BasePlugin):
+    r"""Plugin that expands image/video placeholders into ``<img>``-wrapped runs of ``<IMG_CONTEXT>``."""
+
+    @override
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "ProcessorMixin",
+        **kwargs,
+    ) -> dict[str, "torch.Tensor"]:
+        r"""Run the image processor over images and video frames and collect the patch tensors.
+
+        The returned dict may contain:
+            pixel_values: image patches followed by video patches, concatenated on dim 0
+            image_num_patches: the processor's ``num_patches`` output for the images
+            video_patch_indices: cumulative number of frames over the videos
+            video_num_patches: the processor's ``num_patches`` output for the video frames
+
+        ``audios`` and ``kwargs`` are accepted but not used here.
+        """
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+        image_processor_kwargs = {}
+        if getattr(processor, "crop_to_patches", False):
+            image_processor_kwargs.update(
+                {
+                    "crop_to_patches": True,
+                    "max_patches": 12,
+                    "min_patches": 1,
+                }
+            )
+
+        mm_inputs = {}
+        image_video_patches = []
+
+        if len(images) != 0:
+            images = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 1024 * 1024),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )["images"]
+
+        if len(videos) != 0:
+            videos = self._regularize_videos(
+                videos,
+                image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+                video_fps=getattr(processor, "video_fps", 2.0),
+                video_maxlen=getattr(processor, "video_maxlen", 128),
+            )["videos"]
+
+        if len(images) != 0:
+            images = make_flat_list_of_images(images)
+            image_inputs = image_processor(images=images, return_tensors="pt", **image_processor_kwargs)
+            image_num_patches = image_inputs.pop("num_patches")
+            image_pixel_values = image_inputs.pop("pixel_values")
+            # running totals, used below to slice out the patches of each image
+            image_num_patches_indices = np.cumsum(image_num_patches)
+
+        if len(videos) != 0:
+            videos = make_batched_videos(videos)
+            num_frames_per_video = [len(video) for video in videos]
+            # running totals of frames: entry i is the frame count of videos 0..i
+            patch_indices = np.cumsum(num_frames_per_video)
+            # video frames are processed with patch cropping switched off, whatever the processor setting
+            image_processor_kwargs["crop_to_patches"] = False
+            video_inputs = image_processor(images=videos, return_tensors="pt", **image_processor_kwargs)
+            video_num_patches = video_inputs.pop("num_patches")
+            video_pixel_values = video_inputs.pop("pixel_values")
+            # running totals of patches over all frames of all videos
+            video_num_patches_indices = np.cumsum(video_num_patches)
+
+        # Interleaved image/video inputs are not supported: all image patches come before all video patches.
+        if len(images) != 0 and image_pixel_values is not None:
+            for i in range(len(images)):
+                start_index = image_num_patches_indices[i - 1] if i > 0 else 0
+                end_index = image_num_patches_indices[i]
+                image_video_patches.append(image_pixel_values[start_index:end_index])
+
+        if len(videos) != 0 and video_pixel_values is not None:
+            # the leading 0 lets video i span frames [prefix[i], prefix[i + 1])
+            patch_indices_with_prefix = [0] + list(patch_indices)
+            for i in range(len(videos)):
+                current_patch_index = patch_indices_with_prefix[i]
+                end_patch_index = patch_indices_with_prefix[i + 1]
+                # translate the frame range of this video into a range over the patch dimension
+                start_index = video_num_patches_indices[current_patch_index - 1] if i > 0 else 0
+                end_index = video_num_patches_indices[end_patch_index - 1]
+                image_video_patches.append(video_pixel_values[start_index:end_index])
+
+        if len(images) != 0 or len(videos) != 0:
+            mm_inputs["pixel_values"] = torch.cat(image_video_patches, dim=0)
+
+        if len(images) != 0:
+            mm_inputs.update({"image_num_patches": image_num_patches})
+
+        if len(videos) != 0:
+            mm_inputs.update({"video_patch_indices": patch_indices})
+            mm_inputs.update({"video_num_patches": video_num_patches})
+
+        return mm_inputs
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> list[dict[str, str]]:
+        r"""Replace image/video placeholders with ``<img>...</img>`` spans of ``<IMG_CONTEXT>`` tokens.
+
+        An image placeholder becomes one span holding ``image_seqlen * num_patches`` context tokens.
+        A video placeholder becomes newline-joined ``FrameK: <img>...</img>`` entries, one per frame.
+        ``image_seqlen`` is the processor's ``image_seq_length`` when ``expand_mm_tokens`` is set, else 1.
+        A deep copy of ``messages`` is edited and returned.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0
+        image_seqlen = getattr(processor, "image_seq_length") if self.expand_mm_tokens else 1
+        messages = deepcopy(messages)
+        # computed unconditionally: the patch counts below are used even when expand_mm_tokens is off
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+
+        image_pixel_patch_list = mm_inputs.get("image_num_patches")  # number of patches of each image
+        video_num_patches = mm_inputs.get("video_num_patches")  # number of patches of every frame, over all videos
+        video_patch_indices = mm_inputs.get("video_patch_indices")  # cumulative frame counts over the videos
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                content = content.replace(
+                    IMAGE_PLACEHOLDER,
+                    f"<img>{'<IMG_CONTEXT>' * image_seqlen * image_pixel_patch_list[num_image_tokens]}</img>",
+                    1,
+                )
+                num_image_tokens += 1
+
+            while VIDEO_PLACEHOLDER in content:
+                # frame range [current_patch_index, end_patch_index) of the current video
+                current_patch_index = video_patch_indices[num_video_tokens - 1] if num_video_tokens > 0 else 0
+                end_patch_index = video_patch_indices[num_video_tokens]
+                num_patches = list(video_num_patches[current_patch_index:end_patch_index])
+                video_replaced_prompt = "\n".join(
+                    f"Frame{i + 1}: <img>{'<IMG_CONTEXT>' * image_seqlen * num_patches[i]}</img>"
+                    for i in range(len(num_patches))
+                )
+                content = content.replace(VIDEO_PLACEHOLDER, video_replaced_prompt, 1)
+                num_video_tokens += 1
+
+            message["content"] = content
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["ProcessorMixin"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Build the model inputs, dropping the patch bookkeeping entries that `_get_mm_inputs` adds."""
+        self._validate_input(processor, images, videos, audios)
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        # these entries are only needed for prompt expansion in `process_messages`
+        mm_inputs.pop("image_num_patches", None)
+        mm_inputs.pop("video_patch_indices", None)
+        mm_inputs.pop("video_num_patches", None)
+        return mm_inputs
+
+
+class KimiVLPlugin(BasePlugin):
+    @override
+    def process_messages(self, messages, images, videos, audios, processor):
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            image_grid_hws = mm_inputs.get("image_grid_hws", [])
+        else:
+            image_grid_hws = [None] * len(images)
+
+        num_image_tokens = 0
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+        merge_length = math.prod(image_processor.merge_kernel_size)
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                image_seqlen = image_grid_hws[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    IMAGE_PLACEHOLDER,
+                    f"<|media_start|>image<|media_content|>{self.image_token * image_seqlen}<|media_end|>",
+                    1,
+                )
+                num_image_tokens += 1
+
+            message["content"] = content
+
+        return messages
+
+
+@dataclass
+class Llama4Plugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            if "pixel_values" in mm_inputs:
+                image_height, image_width = mm_inputs["pixel_values"][0].shape[-2:]
+                num_patches_per_chunk = int(
+                    (image_height // processor.patch_size)
+                    * (image_width // processor.patch_size)
+                    // processor.downsample_ratio
+                )
+                aspect_ratios = mm_inputs.pop("aspect_ratios")
+
+        num_image_tokens = 0
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            if self.expand_mm_tokens:
+                placeholder_count = content.count(IMAGE_PLACEHOLDER)
+                prompt_splits = content.split(IMAGE_PLACEHOLDER)
+                new_content = []
+                for local_image_index, split_part in enumerate(prompt_splits):
+                    new_content.append(split_part)
+                    if local_image_index < placeholder_count:
+                        tokens_for_this_image = processor._prompt_split_image(
+                            aspect_ratios[num_image_tokens], num_patches_per_chunk
+                        )
+                        num_image_tokens += 1
+                        new_content.append(tokens_for_this_image)
+
+                content = "".join(new_content)
+            else:
+                content = content.replace(IMAGE_PLACEHOLDER, self.image_token)
+
+            message["content"] = content
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        self._validate_input(processor, images, videos, audios)
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        mm_inputs.pop("aspect_ratios", None)
+        return mm_inputs
+
+
+@dataclass
+class LlavaPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        messages = deepcopy(messages)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            if "pixel_values" in mm_inputs:
+                height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0]))
+                image_seqlen = (height // processor.patch_size) * (
+                    width // processor.patch_size
+                ) + processor.num_additional_image_tokens
+                if processor.vision_feature_select_strategy == "default":
+                    image_seqlen -= 1
+        else:
+            image_seqlen = 1
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+
+            message["content"] = content.replace("{{image}}", self.image_token)
+
+        return messages
+
+
+@dataclass
+class LlavaNextPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens = 0
+        messages = deepcopy(messages)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            if "pixel_values" in mm_inputs:
+                image_sizes = iter(mm_inputs["image_sizes"].tolist())
+                height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0][0]))
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                if self.expand_mm_tokens:
+                    orig_height, orig_width = next(image_sizes)
+                    image_seqlen = processor._get_number_of_features(orig_height, orig_width, height, width)
+                    if processor.vision_feature_select_strategy == "default":
+                        image_seqlen -= 1
+                else:
+                    image_seqlen = 1
+
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+                num_image_tokens += 1
+
+            message["content"] = content.replace("{{image}}", self.image_token)
+
+        return messages
+
+
+@dataclass
+class LlavaNextVideoPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        messages = deepcopy(messages)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            if "pixel_values" in mm_inputs:
+                image_sizes = iter(mm_inputs["image_sizes"].tolist())
+                height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0][0]))
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                if self.expand_mm_tokens:
+                    orig_height, orig_width = next(image_sizes)
+                    image_seqlen = processor._get_number_of_features(orig_height, orig_width, height, width)
+                    if processor.vision_feature_select_strategy == "default":
+                        image_seqlen -= 1
+                else:
+                    image_seqlen = 1
+
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+
+            message["content"] = content.replace("{{image}}", self.image_token)
+
+        if self.expand_mm_tokens:
+            if "pixel_values_videos" in mm_inputs:
+                one_video = to_numpy_array(mm_inputs.get("pixel_values_videos")[0])
+                height, width = get_image_size(one_video[0])
+                num_frames = one_video.shape[0]  # frame dim is always after batch dim
+                image_seqlen = (height // processor.patch_size) * (width // processor.patch_size)
+                video_seqlen = image_seqlen // 4 * num_frames  # divide by 4 needed for avg pooling layer
+        else:
+            video_seqlen = 1
+
+        for message in messages:
+            content = message["content"]
+            while VIDEO_PLACEHOLDER in content:
+                content = content.replace(VIDEO_PLACEHOLDER, "{{video}}" * video_seqlen, 1)
+
+            message["content"] = content.replace("{{video}}", self.video_token)
+
+        return messages
+
+
+@dataclass
+class MiniCPMVPlugin(BasePlugin):
+    @override
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "MMProcessor",
+        **kwargs,
+    ) -> dict[str, "torch.Tensor"]:
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+        mm_inputs = {}
+        if len(images) != 0:
+            images = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )["images"]
+            if "valid_image_nums_ls" in kwargs:
+                valid_image_nums_ls = kwargs["valid_image_nums_ls"]
+                new_images = []
+                idx = 0
+                for valid_image_nums in valid_image_nums_ls:
+                    new_images.append(images[idx : idx + valid_image_nums])
+                    idx += valid_image_nums
+
+                images = new_images
+
+            image_inputs = image_processor(
+                images, do_pad=True, max_slice_nums=image_processor.max_slice_nums, return_tensors="pt"
+            )
+            mm_inputs.update(image_inputs)
+
+        if len(videos) != 0:
+            videos = self._regularize_videos(
+                videos,
+                image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+                video_fps=getattr(processor, "video_fps", 2.0),
+                video_maxlen=getattr(processor, "video_maxlen", 128),
+            )["videos"]
+            video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
+            mm_inputs.update(video_inputs)
+
+        if len(audios) != 0:
+            audios = self._regularize_audios(
+                audios,
+                sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+            )["audios"]
+            if "valid_audio_nums_ls" in kwargs:
+                valid_audio_nums_ls = kwargs["valid_audio_nums_ls"]
+                audios_ls = []
+                idx = 0
+                for valid_audio_nums in valid_audio_nums_ls:
+                    audios_ls.append(audios[idx : idx + valid_audio_nums])
+                    idx += valid_audio_nums
+            else:
+                audios_ls = [audios]
+
+            audio_features, audio_feature_lens, audio_phs = processor.audio_feature_extract(
+                audios_ls,
+                chunk_input=True,
+                sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+            )
+            audio_feature_lens = [torch.tensor(audio_feature_len) for audio_feature_len in audio_feature_lens]
+            mm_inputs.update({"audio_features": audio_features, "audio_feature_lens": audio_feature_lens})
+            if kwargs.get("ret_phs", False):
+                mm_inputs.update({"audio_phs": audio_phs})
+
+        return mm_inputs
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
+        messages = deepcopy(messages)
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+        mm_inputs, audio_inputs = {}, {}
+        if len(images) != 0 and len(videos) != 0:
+            raise ValueError("MiniCPM-V model does not support input images and videos at the same time.")
+
+        if len(videos) != 0:
+            max_slice_nums = 2
+            use_image_id = False
+            mm_inputs = self._get_mm_inputs([], videos, [], processor)
+        else:
+            max_slice_nums = image_processor.max_slice_nums
+            use_image_id = image_processor.use_image_id
+
+        for i, message in enumerate(messages):
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+                num_image_tokens += 1
+
+            while VIDEO_PLACEHOLDER in content:
+                video_seqlen = len(mm_inputs["pixel_values"][num_video_tokens]) if self.expand_mm_tokens else 1
+                content = content.replace(VIDEO_PLACEHOLDER, "{{image}}" * video_seqlen, 1)
+                num_video_tokens += 1
+
+            while AUDIO_PLACEHOLDER in content:
+                content = content.replace(AUDIO_PLACEHOLDER, "{{audio}}", 1)
+                num_audio_tokens += 1
+
+            message["content"] = content.replace("{{image}}", "(<image>./</image>)").replace(
+                "{{audio}}", "(<audio>./</audio>)"
+            )
+
+        if len(images):
+            mm_inputs = self._get_mm_inputs(images, [], [], processor)
+
+        if len(audios):
+            audio_inputs = self._get_mm_inputs([], [], audios, processor, ret_phs=True)
+
+        if self.expand_mm_tokens and mm_inputs:
+            pattern = "(<image>./</image>)"
+            image_sizes = mm_inputs["image_sizes"]
+            idx = 0
+            for index, message in enumerate(messages):
+                text = message["content"]
+                image_tags = re.findall(pattern, text)
+                text_chunks = text.split(pattern)
+                final_text = ""
+                for i in range(len(image_tags)):
+                    final_text = (
+                        final_text
+                        + text_chunks[i]
+                        + image_processor.get_slice_image_placeholder(
+                            image_sizes[0][idx], idx, max_slice_nums, use_image_id
+                        )
+                    )
+                    idx += 1
+
+                final_text += text_chunks[-1]
+                messages[index]["content"] = final_text
+
+        if self.expand_mm_tokens and audio_inputs:
+            pattern = "(<audio>./</audio>)"
+            idx = 0
+            for index, message in enumerate(messages):
+                text = message["content"]
+                audio_tags = re.findall(pattern, text)
+                text_chunks = text.split(pattern)
+                final_text = ""
+                for i in range(len(audio_tags)):
+                    audio_placeholder = audio_inputs["audio_phs"][0][idx]
+                    final_text = final_text + text_chunks[i] + audio_placeholder
+                    idx += 1
+
+                final_text += text_chunks[-1]
+                messages[index]["content"] = final_text
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Build the multimodal model inputs for a batch, including token span tables.
+
+        For every sample in ``batch_ids`` the positions of the image/slice start and end
+        tokens are paired into an ``image_bound`` table. When ``audios`` is non-empty the
+        same pairing is done for the audio and speaker start/end tokens and the audio
+        features returned by ``_get_mm_inputs`` are merged into the result.
+        """
+        self._validate_input(processor, images, videos, audios)
+
+        def _pair_spans(opening: "torch.Tensor", closing: "torch.Tensor") -> "torch.Tensor":
+            # one row per span: [index right after the opening token, index of the closing token]
+            return torch.hstack([(opening + 1).unsqueeze(-1), closing.unsqueeze(-1)])
+
+        # image spans
+        image_span_tables, images_per_sample = [], []
+        for sample_idx, token_ids in enumerate(batch_ids):
+            ids = torch.tensor(token_ids)
+            tokenizer = processor.tokenizer
+            opens_image = (ids == tokenizer.im_start_id) | (ids == tokenizer.slice_start_id)
+            closes_image = (ids == tokenizer.im_end_id) | (ids == tokenizer.slice_end_id)
+            images_per_sample.append(imglens[sample_idx])
+            image_span_tables.append(_pair_spans(torch.where(opens_image)[0], torch.where(closes_image)[0]))
+
+        mm_inputs = self._get_mm_inputs(images, videos, [], processor, valid_image_nums_ls=images_per_sample)
+        if "tgt_sizes" not in mm_inputs:
+            # no image features were produced: fill the image keys with one empty tensor per sample
+            placeholders = [torch.empty(0) for _ in batch_ids]
+            mm_inputs.update({"tgt_sizes": placeholders, "pixel_values": placeholders, "image_sizes": placeholders})
+
+        mm_inputs.update({"image_bound": image_span_tables})
+
+        if len(audios) > 0:
+            # audio and speaker spans
+            audio_span_tables, speaker_span_tables, audios_per_sample = [], [], []
+            for token_ids, audio_count in zip(batch_ids, audlens):
+                ids = torch.tensor(token_ids)
+                tokenizer = processor.tokenizer
+
+                audio_open = torch.where(ids == tokenizer.audio_start_id)[0]
+                audio_close = torch.where(ids == tokenizer.audio_end_id)[0]
+                assert len(audio_open) == len(audio_close)
+                audio_span_tables.append(_pair_spans(audio_open, audio_close))
+                audios_per_sample.append(audio_count)
+
+                speaker_open = torch.where(ids == tokenizer.spk_start_id)[0]
+                speaker_close = torch.where(ids == tokenizer.spk_end_id)[0]
+                assert len(speaker_open) == len(speaker_close)
+                speaker_span_tables.append(_pair_spans(speaker_open, speaker_close))
+
+            mm_inputs.update(
+                self._get_mm_inputs([], [], audios, processor, valid_audio_nums_ls=audios_per_sample)
+            )
+            mm_inputs.update({"audio_bounds": audio_span_tables, "spk_bounds": speaker_span_tables})
+
+        return mm_inputs
+
+
+@dataclass
+class MllamaPlugin(BasePlugin):
+    r"""Multimodal plugin for Mllama style models, which use a cross-attention mask over image tiles."""
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a deep copy of ``messages`` with every image placeholder replaced by ``self.image_token``.
+
+        The inputs are validated first; the caller's ``messages`` are left untouched.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        messages = deepcopy(messages)
+        for message in messages:
+            message["content"] = message["content"].replace(IMAGE_PLACEHOLDER, self.image_token)
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Build the model inputs and, when any are produced, attach the dense cross-attention mask.
+
+        ``num_tiles`` is removed from the returned mapping; it is only used to build
+        ``cross_attention_mask``.
+        """
+        self._validate_input(processor, images, videos, audios)
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor, imglens)
+        if mm_inputs:
+            num_tiles = mm_inputs.pop("num_tiles")
+            image_token_id: int = getattr(processor, "image_token_id")
+            max_image_tiles: int = getattr(processor.image_processor, "max_image_tiles")
+            # per-sample sparse description of which tokens attend to which image
+            cross_attention_token_mask = [
+                get_cross_attention_token_mask(input_ids, image_token_id) for input_ids in batch_ids
+            ]
+            mm_inputs["cross_attention_mask"] = torch.from_numpy(
+                convert_sparse_cross_attention_mask_to_dense(
+                    cross_attention_token_mask,
+                    num_tiles=num_tiles,
+                    max_num_tiles=max_image_tiles,
+                    length=max(len(input_ids) for input_ids in batch_ids),
+                )
+            )  # shape: (batch_size, length, max_num_images, max_num_tiles)
+
+        return mm_inputs
+
+
+@dataclass
+class PaliGemmaPlugin(BasePlugin):
+    r"""Multimodal plugin for PaliGemma style models, which prepend image tokens to the token ids."""
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a deep copy of ``messages`` with every image placeholder stripped from the text.
+
+        The image tokens themselves are added later by ``process_token_ids``.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            # remove one occurrence at a time until none is left
+            while IMAGE_PLACEHOLDER in content:
+                content = content.replace(IMAGE_PLACEHOLDER, "", 1)
+
+            message["content"] = content
+
+        return messages
+
+    @override
+    def process_token_ids(
+        self,
+        input_ids: list[int],
+        labels: list[int] | None,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        tokenizer: "PreTrainedTokenizer",
+        processor: Optional["MMProcessor"],
+    ) -> tuple[list[int], list[int] | None]:
+        r"""Prepend ``len(images) * image_seq_length`` image token ids to ``input_ids``.
+
+        ``labels`` (when given) is padded with ``IGNORE_INDEX`` by the same amount. Nothing
+        is prepended when ``self.expand_mm_tokens`` is false.
+        """
+        self._validate_input(processor, images, videos, audios)
+        num_images = len(images)
+        image_seqlen = processor.image_seq_length if self.expand_mm_tokens else 0  # skip mm token
+        image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        input_ids = [image_token_id] * num_images * image_seqlen + input_ids
+        if labels is not None:
+            labels = [IGNORE_INDEX] * num_images * image_seqlen + labels
+
+        return input_ids, labels
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Build the model inputs and add ``token_type_ids`` derived from ``imglens`` and the sequence lengths."""
+        self._validate_input(processor, images, videos, audios)
+        seqlens = [len(input_ids) for input_ids in batch_ids]
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        mm_inputs["token_type_ids"] = _get_paligemma_token_type_ids(imglens, seqlens, processor)
+        return mm_inputs
+
+
+@dataclass
+class PixtralPlugin(BasePlugin):
+    r"""Multimodal plugin for Pixtral style models, which lay image tokens out row by row."""
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a deep copy of ``messages`` with image placeholders replaced by image tokens.
+
+        With ``expand_mm_tokens`` each placeholder becomes a grid of image tokens: every row
+        is followed by the break token and the very last token is the end token. Otherwise
+        each placeholder becomes a single ``self.image_token``.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        messages = deepcopy(messages)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            if "pixel_values" in mm_inputs:
+                raw_sizes = mm_inputs["image_sizes"]
+                # BC for transformers < 4.49.0
+                image_sizes = iter(raw_sizes[0] if isinstance(raw_sizes, list) else raw_sizes.tolist())
+                image_break_token: str = processor.image_break_token
+                image_end_token: str = processor.image_end_token
+
+        for message in messages:
+            text = message["content"]
+            while IMAGE_PLACEHOLDER in text:
+                if not self.expand_mm_tokens:
+                    replacement = self.image_token
+                else:
+                    cell = processor.patch_size * getattr(processor, "spatial_merge_size", 1)
+                    height, width = next(image_sizes)
+                    rows, cols = height // cell, width // cell
+                    pieces = []
+                    for _ in range(rows):
+                        pieces.extend([self.image_token] * cols)
+                        pieces.append(image_break_token)
+
+                    pieces[-1] = image_end_token  # the final break token becomes the end token
+                    replacement = "".join(pieces)
+
+                text = text.replace(IMAGE_PLACEHOLDER, replacement, 1)
+
+            message["content"] = text
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Build the model inputs, dropping ``image_sizes`` on transformers versions that do not take it."""
+        self._validate_input(processor, images, videos, audios)
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        # ref to this commit https://github.com/huggingface/transformers/pull/35122
+        # after transformers 4.49.0, the `image_sizes` is mandatory as an input parameter for Pixtral VisionEncoder forwarding.
+        # it can be passed into `LlavaConditionalGeneration` as a parameter.
+        if is_transformers_version_greater_than("4.49.0"):
+            return mm_inputs
+
+        mm_inputs.pop("image_sizes", None)
+        return mm_inputs
+
+
+@dataclass
+class Qwen2AudioPlugin(BasePlugin):
+    r"""Multimodal plugin for Qwen2-Audio style models."""
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a deep copy of ``messages`` with audio placeholders replaced by audio tokens.
+
+        Each placeholder becomes ``bos + audio_token * n + eos``. With ``expand_mm_tokens``
+        ``n`` is derived from the summed ``feature_attention_mask`` of the matching audio;
+        otherwise ``n`` is 1.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        bos_token: str = processor.audio_bos_token
+        eos_token: str = processor.audio_eos_token
+        messages = deepcopy(messages)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs([], [], audios, processor)
+            if "feature_attention_mask" in mm_inputs:
+                # one feature length per audio, consumed front to back below
+                audio_lengths = mm_inputs["feature_attention_mask"].sum(-1).tolist()
+
+        for message in messages:
+            text = message["content"]
+            while AUDIO_PLACEHOLDER in text:
+                audio_seqlen = 1
+                if self.expand_mm_tokens:
+                    feature_len = audio_lengths.pop(0)
+                    # two successive length reductions applied to the feature length
+                    halved = (feature_len - 1) // 2 + 1
+                    audio_seqlen = (halved - 2) // 2 + 1
+
+                expanded = f"{bos_token}{self.audio_token * audio_seqlen}{eos_token}"
+                text = text.replace(AUDIO_PLACEHOLDER, expanded, 1)
+
+            message["content"] = text
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["MMProcessor"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Validate the inputs and return whatever ``_get_mm_inputs`` produces for them."""
+        self._validate_input(processor, images, videos, audios)
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        return mm_inputs
+
+
+@dataclass
+class Qwen2VLPlugin(BasePlugin):
+    r"""Multimodal plugin for Qwen2-VL style models (images and videos)."""
+
+    vision_bos_token: str = "<|vision_start|>"
+    vision_eos_token: str = "<|vision_end|>"
+
+    @override
+    def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
+        r"""Run the base preprocessing, then clamp tiny sides and extreme aspect ratios.
+
+        Sides shorter than 28 pixels are raised to 28, and an aspect ratio above 200
+        (in either direction) is reduced to 180 by resizing the longer side.
+        """
+        image = super()._preprocess_image(image, **kwargs)
+        if min(image.width, image.height) < 28:
+            width, height = max(image.width, 28), max(image.height, 28)
+            image = image.resize((width, height))
+
+        if image.width / image.height > 200:
+            width, height = image.height * 180, image.height
+            image = image.resize((width, height))
+
+        if image.height / image.width > 200:
+            width, height = image.width, image.width * 180
+            image = image.resize((width, height))
+
+        return image
+
+    @override
+    def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> "RegularizedVideoOutput":
+        r"""Turn each video into an even-length list of regularized frames.
+
+        A video is either a list of frames (images) or something ``av.open`` can read.
+        Returns the frames together with a per-video fps and duration. The caller's
+        frame lists are never modified, and opened containers are closed before returning.
+
+        Raises:
+            ValueError: if a frame of a nested-image video is not a valid image, a dict or an existing path.
+        """
+        results, fps_per_video, durations = [], [], []
+        default_fps = kwargs.get("video_fps", 2.0)
+        for video in videos:
+            frames: list[ImageObject] = []
+            if _check_video_is_nested_images(video):
+                for frame in video:
+                    if not is_valid_image(frame) and not isinstance(frame, dict) and not os.path.exists(frame):
+                        raise ValueError("Invalid image found in video frames.")
+
+                # copy: the padding below must not append to the caller's list
+                frames = list(video)
+                fps_per_video.append(default_fps)
+                durations.append(len(frames) / default_fps)
+            else:
+                # the context manager closes the container even if decoding fails
+                with av.open(video, "r") as container:
+                    video_stream = next(stream for stream in container.streams if stream.type == "video")
+                    sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+                    container.seek(0)
+                    for frame_idx, frame in enumerate(container.decode(video_stream)):
+                        if frame_idx in sample_indices:
+                            frames.append(frame.to_image())
+
+                    if video_stream.duration is None:
+                        fps_per_video.append(default_fps)
+                        durations.append(len(frames) / default_fps)
+                    else:
+                        seconds = float(video_stream.duration * video_stream.time_base)
+                        fps_per_video.append(len(sample_indices) / seconds)
+                        durations.append(seconds)
+
+            # pad to an even number of frames by repeating the last one
+            if len(frames) % 2 != 0:
+                frames.append(frames[-1])
+
+            frames = self._regularize_images(frames, **kwargs)["images"]
+            results.append(frames)
+
+        return {"videos": results, "fps_per_video": fps_per_video, "durations": durations}
+
+    @override
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "MMProcessor",
+    ) -> dict[str, "torch.Tensor"]:
+        r"""Run the image and video processors on the regularized inputs.
+
+        ``audios`` is not used here. ``second_per_grid_ts`` is added for videos only when
+        the processor lists it in ``model_input_names``.
+        """
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+        video_processor: BaseVideoProcessor = getattr(processor, "video_processor", None)
+        mm_inputs = {}
+        if len(images) != 0:
+            images = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )["images"]
+            mm_inputs.update(image_processor(images, return_tensors="pt"))
+
+        if len(videos) != 0:
+            video_data = self._regularize_videos(
+                videos,
+                image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+                video_fps=getattr(processor, "video_fps", 2.0),
+                video_maxlen=getattr(processor, "video_maxlen", 128),
+            )
+            mm_inputs.update(video_processor(videos=video_data["videos"], return_tensors="pt"))
+            temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
+            if "second_per_grid_ts" in processor.model_input_names:
+                mm_inputs["second_per_grid_ts"] = [temporal_patch_size / fps for fps in video_data["fps_per_video"]]
+
+        return mm_inputs
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a deep copy of ``messages`` with image/video placeholders replaced by vision tokens.
+
+        Each placeholder becomes ``vision_bos + token * n + vision_eos``. With
+        ``expand_mm_tokens`` ``n`` is the product of the matching ``*_grid_thw`` entry
+        divided by ``merge_size ** 2``; otherwise ``n`` is 1.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0
+        messages = deepcopy(messages)
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+
+        merge_length: int = getattr(image_processor, "merge_size") ** 2
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            image_grid_thw = mm_inputs.get("image_grid_thw", [])
+            video_grid_thw = mm_inputs.get("video_grid_thw", [])
+        else:
+            image_grid_thw = [None] * len(images)
+            video_grid_thw = [None] * len(videos)
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    IMAGE_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                    1,
+                )
+                num_image_tokens += 1
+
+            while VIDEO_PLACEHOLDER in content:
+                video_seqlen = video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    VIDEO_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                    1,
+                )
+                num_video_tokens += 1
+
+            message["content"] = content
+
+        return messages
+
+
+@dataclass
+class Qwen3VLPlugin(Qwen2VLPlugin):
+    r"""Multimodal plugin for Qwen3-VL style models, which prefix every video frame with a timestamp."""
+
+    @override
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "MMProcessor",
+    ) -> dict[str, "torch.Tensor"]:
+        r"""Run the image and video processors on the regularized inputs.
+
+        ``audios`` is not used here. The video processor is called with per-video metadata
+        and ``return_metadata=True``. ``second_per_grid_ts`` is added only when the
+        processor lists it in ``model_input_names``.
+        """
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+        video_processor: BaseVideoProcessor = getattr(processor, "video_processor", None)
+        mm_inputs = {}
+        if len(images) != 0:
+            images = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )["images"]
+            mm_inputs.update(image_processor(images, return_tensors="pt"))
+
+        if len(videos) != 0:
+            video_data = self._regularize_videos(
+                videos,
+                image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+                video_fps=getattr(processor, "video_fps", 2.0),
+                video_maxlen=getattr(processor, "video_maxlen", 128),
+            )
+            video_metadata = [
+                {"fps": getattr(processor, "video_fps", 24.0), "duration": duration, "total_num_frames": len(video)}
+                for video, duration in zip(video_data["videos"], video_data["durations"])
+            ]
+            mm_inputs.update(
+                video_processor(
+                    videos=video_data["videos"],
+                    video_metadata=video_metadata,
+                    fps=getattr(processor, "video_fps", 2.0),
+                    return_metadata=True,
+                )
+            )
+            temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
+            if "second_per_grid_ts" in processor.model_input_names:
+                mm_inputs["second_per_grid_ts"] = [temporal_patch_size / fps for fps in video_data["fps_per_video"]]
+
+        return mm_inputs
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a deep copy of ``messages`` with image/video placeholders replaced by vision tokens.
+
+        Images become ``vision_bos + image_token * n + vision_eos``. With
+        ``expand_mm_tokens`` a video becomes one ``<T seconds>`` + vision-token group per
+        temporal grid step; otherwise it becomes a single bos/video_token/eos triple.
+
+        Videos are matched to their metadata and grid entry by their running ordinal
+        across all messages, not by the index of the message that contains them.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0
+        messages = deepcopy(messages)
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+        video_processor: BaseVideoProcessor = getattr(processor, "video_processor")
+
+        image_merge_length: int = getattr(image_processor, "merge_size") ** 2
+        video_merge_length: int = getattr(video_processor, "merge_size") ** 2
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            image_grid_thw = mm_inputs.get("image_grid_thw", [])
+            video_grid_thw = mm_inputs.get("video_grid_thw", [])
+            video_metadata = mm_inputs.get("video_metadata", {})
+        else:
+            image_grid_thw = [None] * len(images)
+            video_grid_thw = [None] * len(videos)
+            video_metadata = {}
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                image_seqlen = (
+                    image_grid_thw[num_image_tokens].prod() // image_merge_length if self.expand_mm_tokens else 1
+                )
+                content = content.replace(
+                    IMAGE_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                    1,
+                )
+                num_image_tokens += 1
+
+            while VIDEO_PLACEHOLDER in content:
+                if self.expand_mm_tokens:
+                    # index by video ordinal: a video may live in any message, and one
+                    # message may hold several videos
+                    metadata = video_metadata[num_video_tokens]
+                    grid_thw = video_grid_thw[num_video_tokens]
+                    timestamps = processor._calculate_timestamps(
+                        metadata.frames_indices,
+                        metadata.fps,
+                        video_processor.merge_size,
+                    )
+                    num_frames = int(grid_thw[0])  # temporal grid size of this video
+                    video_seqlen = grid_thw[1:].prod() // video_merge_length  # tokens per frame group
+                    video_structure = ""
+                    for frame_index in range(num_frames):
+                        timestamp_sec = timestamps[frame_index]
+                        frame_structure = (
+                            f"<{timestamp_sec:.1f} seconds>"
+                            f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}"
+                        )
+                        video_structure += frame_structure
+                else:
+                    video_structure = f"{self.vision_bos_token}{self.video_token}{self.vision_eos_token}"
+
+                content = content.replace(VIDEO_PLACEHOLDER, video_structure, 1)
+                num_video_tokens += 1
+
+            message["content"] = content
+
+        return messages
+
+
+@dataclass
+class GLM4VPlugin(Qwen2VLPlugin):
+    # Plugin for GLM-4V style models: images and video frames are wrapped in
+    # <|begin_of_image|>/<|end_of_image|> and videos in <|begin_of_video|>/<|end_of_video|>.
+    @override
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "MMProcessor",
+    ) -> dict[str, "torch.Tensor"]:
+        r"""Run the image and video processors on the regularized inputs.
+
+        ``audios`` is not used here. The video processor is called with per-video
+        metadata whose ``fps`` is fixed to 2.
+        """
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+        video_processor: BaseImageProcessor = getattr(processor, "video_processor", None)
+        mm_inputs = {}
+        if len(images) != 0:
+            images = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )["images"]
+            mm_inputs.update(image_processor(images, return_tensors="pt"))
+
+        if len(videos) != 0:
+            video_data = self._regularize_videos(
+                videos,
+                image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+                video_fps=getattr(processor, "video_fps", 2.0),
+                video_maxlen=getattr(processor, "video_maxlen", 128),
+            )
+            # prepare video metadata
+            # NOTE(review): fps is hard-coded to 2 here rather than read from the processor's
+            # `video_fps` — confirm this is intended.
+            video_metadata = [
+                {"fps": 2, "duration": duration, "total_frames": len(video)}
+                for video, duration in zip(video_data["videos"], video_data["durations"])
+            ]
+            mm_inputs.update(video_processor(images=None, videos=video_data["videos"], video_metadata=video_metadata))
+
+        return mm_inputs
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a deep copy of ``messages`` with image/video placeholders replaced by GLM-4V tokens.
+
+        With ``expand_mm_tokens`` a video becomes one image-token group followed by its
+        timestamp per frame; otherwise it becomes a single ``self.video_token``. Either way
+        the result is wrapped in ``<|begin_of_video|>``/``<|end_of_video|>``.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0
+        messages = deepcopy(messages)
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+
+        merge_length: int = getattr(image_processor, "merge_size") ** 2
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            image_grid_thw = mm_inputs.get("image_grid_thw", [])
+            video_grid_thw = mm_inputs.get("video_grid_thw", [])
+            # frame count is taken from the first video only and reused for every video placeholder
+            num_frames = video_grid_thw[0][0] if len(video_grid_thw) > 0 else 0  # hard code for now
+            timestamps = mm_inputs.get("timestamps", [])
+
+            # normalise `timestamps` to a flat Python list
+            if hasattr(timestamps, "tolist"):
+                timestamps = timestamps.tolist()
+
+            if not timestamps:
+                timestamps_list = []
+            elif isinstance(timestamps[0], list):
+                # nested list: only the first entry is used
+                timestamps_list = timestamps[0]
+            else:
+                timestamps_list = timestamps
+
+            # despite the name, this is a plain copy; no de-duplication happens here
+            unique_timestamps = timestamps_list.copy()
+            selected_timestamps = unique_timestamps[:num_frames]
+            # pad with the last timestamp (or 0 when there is none) up to `num_frames` entries
+            while len(selected_timestamps) < num_frames:
+                selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
+
+        else:
+            image_grid_thw = [None] * len(images)
+            video_grid_thw = [None] * len(videos)
+            num_frames = 0
+            selected_timestamps = [0]
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    IMAGE_PLACEHOLDER, f"<|begin_of_image|>{self.image_token * image_seqlen}<|end_of_image|>", 1
+                )
+                num_image_tokens += 1
+
+            while VIDEO_PLACEHOLDER in content:
+                video_structure = ""
+                # `num_frames` is 0 when tokens are not expanded, so this loop only runs in expand mode
+                for frame_index in range(num_frames):
+                    video_seqlen = (
+                        video_grid_thw[num_video_tokens][1:].prod() // merge_length if self.expand_mm_tokens else 1
+                    )
+                    timestamp_sec = selected_timestamps[frame_index]
+                    frame_structure = (
+                        f"<|begin_of_image|>{self.image_token * video_seqlen}<|end_of_image|>{timestamp_sec}"
+                    )
+                    video_structure += frame_structure
+
+                if not self.expand_mm_tokens:
+                    video_structure = self.video_token
+
+                content = content.replace(VIDEO_PLACEHOLDER, f"<|begin_of_video|>{video_structure}<|end_of_video|>", 1)
+                num_video_tokens += 1
+
+            message["content"] = content
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        imglens: list[int],
+        vidlens: list[int],
+        audlens: list[int],
+        batch_ids: list[list[int]],
+        processor: Optional["ProcessorMixin"],
+    ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+        r"""Build the model inputs and drop ``timestamps``, which is only used to format the messages."""
+        self._validate_input(processor, images, videos, audios)
+        mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        mm_inputs.pop("timestamps", None)
+        return mm_inputs
+
+
+@dataclass
+class Qwen2OmniPlugin(Qwen2VLPlugin):
+    audio_bos_token: str = "<|audio_start|>"
+    audio_eos_token: str = "<|audio_end|>"
+
+    @override
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "MMProcessor",
+    ) -> dict[str, "torch.Tensor"]:
+        r"""Run the image, video and audio processors on the regularized inputs.
+
+        Videos additionally get ``video_second_per_grid``; the audio attention mask is
+        stored under ``feature_attention_mask`` instead of ``attention_mask``.
+        """
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+        video_processor: BaseVideoProcessor = getattr(processor, "video_processor", None)
+        feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None)
+        mm_inputs = {}
+
+        if len(images) > 0:
+            regularized = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )
+            mm_inputs.update(image_processor(regularized["images"], return_tensors="pt"))
+
+        if len(videos) > 0:
+            regularized = self._regularize_videos(
+                videos,
+                image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+                image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+                video_fps=getattr(processor, "video_fps", 2.0),
+                video_maxlen=getattr(processor, "video_maxlen", 128),
+            )
+            mm_inputs.update(video_processor(videos=regularized["videos"], return_tensors="pt"))
+            temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
+            seconds_per_grid = []
+            for fps in regularized["fps_per_video"]:
+                seconds_per_grid.append(temporal_patch_size / fps)
+
+            mm_inputs["video_second_per_grid"] = torch.tensor(seconds_per_grid)
+
+        if len(audios) > 0:
+            sampling_rate = getattr(processor, "audio_sampling_rate", 16000)
+            waveforms = self._regularize_audios(audios, sampling_rate=sampling_rate)["audios"]
+            audio_features = feature_extractor(
+                waveforms,
+                sampling_rate=sampling_rate,
+                return_attention_mask=True,
+                padding="max_length",
+                return_tensors="pt",
+            )
+            mm_inputs.update(audio_features)
+            mm_inputs["feature_attention_mask"] = mm_inputs.pop("attention_mask")  # prevent conflicts
+
+        return mm_inputs
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
+        messages = deepcopy(messages)
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+
+        merge_length = processor.image_processor.merge_size**2
+        use_audio_in_video = getattr(processor, "use_audio_in_video", False)
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            image_grid_thw = mm_inputs.get("image_grid_thw", [])
+            video_grid_thw = mm_inputs.get("video_grid_thw", [])
+            if "feature_attention_mask" in mm_inputs:
+                if processor.__class__.__name__ == "Qwen3OmniMoeProcessor":  # for qwen3omni
+                    input_lengths = mm_inputs["feature_attention_mask"].sum(-1)
+                    input_lengths_leave = input_lengths % 100
+                    feature_lengths = (input_lengths_leave - 1) // 2 + 1
+                    audio_lengths = ((feature_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+                else:
+                    input_lengths = (mm_inputs["feature_attention_mask"].sum(-1).numpy() - 1) // 2 + 1
+                    audio_lengths = (input_lengths - 2) // 2 + 1
+        else:
+            mm_inputs = {}
+            image_grid_thw = [None] * len(images)
+            video_grid_thw = [None] * len(videos)
+            audio_lengths = [None] * len(audios)
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    IMAGE_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                    1,
+                )
+                num_image_tokens += 1
+
+            if (
+                use_audio_in_video and len(audios) and len(videos)
+            ):  # if use the audio of video # deal video token and audio token togather
+                if len(videos) != len(audios):
+                    raise ValueError(
+                        f"Number of videos ({len(videos)}) must match number of audios ({len(audios)}) when using audio in video."
+                    )
+
+                while VIDEO_PLACEHOLDER in content:
+                    video_pos = content.find(VIDEO_PLACEHOLDER)
+                    audio_pos = content.find(AUDIO_PLACEHOLDER, video_pos)
+                    if audio_pos == -1 or audio_pos < video_pos:
+                        raise ValueError(
+                            f"Each {VIDEO_PLACEHOLDER} must be followed by an {AUDIO_PLACEHOLDER} when using audio in video."
+                        )
+
+                    audio_t_index = torch.arange(audio_lengths[num_audio_tokens])
+                    video_t_index = (
+                        torch.arange(video_grid_thw[num_video_tokens][0])
+                        .view(-1, 1, 1)
+                        .expand(
+                            -1,
+                            video_grid_thw[num_video_tokens][1] // image_processor.merge_size,
+                            video_grid_thw[num_video_tokens][2] // image_processor.merge_size,
+                        )
+                        .flatten()
+                        * mm_inputs["video_second_per_grid"][num_video_tokens]
+                        * 25  # FIXME hardcode of position_id_per_seconds=25
+                    ).long()
+                    t_ntoken_per_chunk = 50  # FIXME hardcode: [25 * 2]
+                    video_chunk_indices = processor.get_chunked_index(video_t_index, t_ntoken_per_chunk)
+                    audio_chunk_indices = processor.get_chunked_index(audio_t_index, t_ntoken_per_chunk)
+                    placeholder_string = ""
+                    placeholder_string += self.vision_bos_token + self.audio_bos_token
+                    for j in range(max(len(video_chunk_indices), len(audio_chunk_indices))):
+                        video_chunk_index = video_chunk_indices[j] if j < len(video_chunk_indices) else None
+                        audio_chunk_index = audio_chunk_indices[j] if j < len(audio_chunk_indices) else None
+                        if video_chunk_index is not None:
+                            placeholder_string += self.video_token * (video_chunk_index[1] - video_chunk_index[0])
+
+                        if audio_chunk_index is not None:
+                            placeholder_string += self.audio_token * (audio_chunk_index[1] - audio_chunk_index[0])
+
+                    placeholder_string += self.audio_eos_token + self.vision_eos_token
+                    content = content.replace(VIDEO_PLACEHOLDER, placeholder_string, 1)
+                    content = content.replace(AUDIO_PLACEHOLDER, "", 1)
+                    num_audio_tokens += 1
+                    num_video_tokens += 1
+            else:
+                while AUDIO_PLACEHOLDER in content:
+                    audio_seqlen = audio_lengths[num_audio_tokens] if self.expand_mm_tokens else 1
+                    content = content.replace(
+                        AUDIO_PLACEHOLDER,
+                        f"{self.audio_bos_token}{self.audio_token * audio_seqlen}{self.audio_eos_token}",
+                        1,
+                    )
+                    num_audio_tokens += 1
+
+                while VIDEO_PLACEHOLDER in content:
+                    video_seqlen = (
+                        video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+                    )
+                    content = content.replace(
+                        VIDEO_PLACEHOLDER,
+                        f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                        1,
+                    )
+                    num_video_tokens += 1
+
+            message["content"] = content
+
+        return messages
+
+
+@dataclass
+class VideoLlavaPlugin(BasePlugin):  # registered under "video_llava" in PLUGINS
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:  # returns a deep copy with image / video placeholders expanded to token runs
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0  # incremented per placeholder; not read again in this method
+        messages = deepcopy(messages)
+        num_frames = 0
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            if "pixel_values_images" in mm_inputs:
+                height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values_images"][0]))  # first image
+                num_frames = 1
+
+            if "pixel_values_videos" in mm_inputs:  # when present, overrides height / width / num_frames from images
+                one_video = to_numpy_array(mm_inputs["pixel_values_videos"][0])
+                height, width = get_image_size(one_video[0])
+                num_frames = one_video.shape[0]  # frame dim is always after batch dim
+
+            if "pixel_values_images" in mm_inputs or "pixel_values_videos" in mm_inputs:  # else seqlens stay unbound
+                image_seqlen = (height // processor.patch_size) * (  # patch count per frame plus additional tokens
+                    width // processor.patch_size
+                ) + processor.num_additional_image_tokens
+                video_seqlen = image_seqlen * num_frames  # computed before the "default" adjustment below
+                if processor.vision_feature_select_strategy == "default":
+                    image_seqlen -= 1  # only the image length drops one token; video_seqlen is unchanged
+        else:
+            image_seqlen, video_seqlen = 1, 1  # no expansion: one token per placeholder
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:  # temporary "{{image}}" marker is swapped for the real token below
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+                num_image_tokens += 1
+
+            while VIDEO_PLACEHOLDER in content:  # same two-step scheme with a "{{video}}" marker
+                content = content.replace(VIDEO_PLACEHOLDER, "{{video}}" * video_seqlen, 1)
+                num_video_tokens += 1
+
+            content = content.replace("{{image}}", self.image_token)
+            message["content"] = content.replace("{{video}}", self.video_token)
+
+        return messages
+
+
+@dataclass
+class LFMVLPlugin(BasePlugin):
+    r"""Plugin for LFM2.5-VL vision-language models.
+
+    LFM2.5-VL uses dynamic image token counts based on image resolution.
+    The image processor returns spatial_shapes tensor with [height, width] grid dimensions.
+    Token count per image = (spatial_h * spatial_w) // (downsample_factor^2); 1 token when no shape is available.
+    """
+
+    @override
+    def _get_mm_inputs(
+        self,
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: "MMProcessor",
+    ) -> dict[str, "torch.Tensor"]:  # image processor outputs only; videos and audios are ignored here
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+        mm_inputs = {}
+        if len(images) != 0:  # the image processor is only invoked when images exist
+            images = self._regularize_images(
+                images,
+                image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+                image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+            )["images"]
+            mm_inputs.update(image_processor(images, return_tensors="pt"))
+        return mm_inputs
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:  # returns a deep copy with image placeholders expanded to image-token runs
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        num_image_tokens = 0  # index of the next image whose spatial shape is consumed
+        messages = deepcopy(messages)
+        # Default of None keeps processor=None usable (matches _get_mm_inputs); downsample_factor then falls back to 2.
+        image_processor: Optional[BaseImageProcessor] = getattr(processor, "image_processor", None)
+        downsample_factor: int = getattr(image_processor, "downsample_factor", 2)
+        if self.expand_mm_tokens and len(images) > 0:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            spatial_shapes = mm_inputs.get("spatial_shapes", [])
+        else:
+            spatial_shapes = []
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                if self.expand_mm_tokens and len(spatial_shapes) > num_image_tokens:
+                    h, w = spatial_shapes[num_image_tokens].tolist()
+                    image_seqlen = (h * w) // (downsample_factor * downsample_factor)
+                else:  # not expanding, or no shape for this image: a single token
+                    image_seqlen = 1
+
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)  # temporary marker
+                num_image_tokens += 1
+
+            message["content"] = content.replace("{{image}}", self.image_token)
+
+        return messages
+
+
+@dataclass
+class YoutuVLPlugin(BasePlugin):
+    r"""Multimodal plugin that wraps Youtu-VL image/video tokens in vision start/end markers."""
+
+    vision_bos_token: str = "<|vision_start|>"
+    vision_eos_token: str = "<|vision_end|>"
+
+    @override
+    def process_messages(
+        self,
+        messages: list[dict[str, str]],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+        processor: Optional["MMProcessor"],
+    ) -> list[dict[str, str]]:
+        r"""Return a copy of ``messages`` where each image/video placeholder becomes one wrapped token."""
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+
+        expanded = deepcopy(messages)
+        # No per-input sizing here: every placeholder maps to exactly bos + token + eos.
+        image_span = f"{self.vision_bos_token}{self.image_token}{self.vision_eos_token}"
+        video_span = f"{self.vision_bos_token}{self.video_token}{self.vision_eos_token}"
+
+        for turn in expanded:
+            text = turn["content"]
+            text = text.replace(IMAGE_PLACEHOLDER, image_span)
+            text = text.replace(VIDEO_PLACEHOLDER, video_span)
+            turn["content"] = text
+
+        return expanded
+
+
+PLUGINS = {  # plugin name -> plugin class; read by get_mm_plugin and extended by register_mm_plugin
+    "base": BasePlugin,
+    "ernie_vl": ErnieVLPlugin,
+    "gemma3": Gemma3Plugin,
+    "glm4v": GLM4VPlugin,
+    "gemma3n": Gemma3nPlugin,
+    "intern_vl": InternVLPlugin,
+    "kimi_vl": KimiVLPlugin,
+    "llama4": Llama4Plugin,
+    "llava": LlavaPlugin,
+    "llava_next": LlavaNextPlugin,
+    "llava_next_video": LlavaNextVideoPlugin,
+    "lfm2_vl": LFMVLPlugin,
+    "minicpm_v": MiniCPMVPlugin,
+    "mllama": MllamaPlugin,
+    "paligemma": PaliGemmaPlugin,
+    "pixtral": PixtralPlugin,
+    "qwen2_audio": Qwen2AudioPlugin,
+    "qwen2_omni": Qwen2OmniPlugin,
+    "qwen2_vl": Qwen2VLPlugin,
+    "qwen3_vl": Qwen3VLPlugin,
+    "video_llava": VideoLlavaPlugin,
+    "youtu_vl": YoutuVLPlugin,
+}
+
+
+def register_mm_plugin(name: str, plugin_class: type["BasePlugin"]) -> None:
+    r"""Add ``plugin_class`` to the ``PLUGINS`` registry under ``name``; an existing name raises ``ValueError``."""
+    already_registered = name in PLUGINS
+    if already_registered:
+        raise ValueError(f"Multimodal plugin {name} already exists.")
+    PLUGINS[name] = plugin_class
+
+
+def get_mm_plugin(
+    name: str,
+    image_token: str | None = None,
+    video_token: str | None = None,
+    audio_token: str | None = None,
+    **kwargs,
+) -> "BasePlugin":
+    r"""Instantiate the plugin registered under ``name`` with the given tokens; unknown names raise ``ValueError``."""
+    if name not in PLUGINS:
+        raise ValueError(f"Multimodal plugin `{name}` not found.")
+    plugin_class = PLUGINS[name]
+    return plugin_class(image_token, video_token, audio_token, **kwargs)
diff --git a/LlamaFactory/src/llamafactory/data/parser.py b/LlamaFactory/src/llamafactory/data/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..5209da64954a363c59d232010111139b7b37fdfe
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/parser.py
@@ -0,0 +1,149 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from dataclasses import dataclass
+from typing import Any, Literal
+
+from huggingface_hub import hf_hub_download
+
+from ..extras.constants import DATA_CONFIG
+from ..extras.misc import use_modelscope, use_openmind
+
+
+@dataclass
+class DatasetAttr:
+    r"""Dataset attributes: where a dataset is loaded from plus its column and tag name mappings."""
+
+    # basic configs ("cloud_file" is listed because get_dataset_list constructs it for cloud_file_name entries)
+    load_from: Literal["hf_hub", "ms_hub", "om_hub", "script", "file", "cloud_file"]
+    dataset_name: str
+    formatting: Literal["alpaca", "sharegpt", "openai"] = "alpaca"
+    ranking: bool = False
+    # extra configs
+    subset: str | None = None
+    split: str = "train"
+    folder: str | None = None
+    num_samples: int | None = None
+    # common columns
+    system: str | None = None
+    tools: str | None = None
+    images: str | None = None
+    videos: str | None = None
+    audios: str | None = None
+    # dpo columns
+    chosen: str | None = None
+    rejected: str | None = None
+    kto_tag: str | None = None
+    # alpaca columns
+    prompt: str | None = "instruction"
+    query: str | None = "input"
+    response: str | None = "output"
+    history: str | None = None
+    # sharegpt columns
+    messages: str | None = "conversations"
+    # sharegpt tags
+    role_tag: str | None = "from"
+    content_tag: str | None = "value"
+    user_tag: str | None = "human"
+    assistant_tag: str | None = "gpt"
+    observation_tag: str | None = "observation"
+    function_tag: str | None = "function_call"
+    system_tag: str | None = "system"
+
+    def __repr__(self) -> str:  # the repr is just the dataset name
+        return self.dataset_name
+
+    def set_attr(self, key: str, obj: dict[str, Any], default: Any | None = None) -> None:
+        setattr(self, key, obj.get(key, default))  # a key missing from obj overwrites the field with `default`
+
+    def join(self, attr: dict[str, Any]) -> None:  # apply one dataset config entry on top of this instance
+        self.set_attr("formatting", attr, default="alpaca")
+        self.set_attr("ranking", attr, default=False)
+        self.set_attr("subset", attr)
+        self.set_attr("split", attr, default="train")
+        self.set_attr("folder", attr)
+        self.set_attr("num_samples", attr)
+
+        if "columns" in attr:  # once "columns" is given, every column it does not list is reset to None
+            column_names = ["prompt", "query", "response", "history", "messages", "system", "tools"]
+            column_names += ["images", "videos", "audios", "chosen", "rejected", "kto_tag"]
+            for column_name in column_names:
+                self.set_attr(column_name, attr["columns"])
+
+        if "tags" in attr:  # likewise, every tag missing from "tags" is reset to None
+            tag_names = ["role_tag", "content_tag"]
+            tag_names += ["user_tag", "assistant_tag", "observation_tag", "function_tag", "system_tag"]
+            for tag in tag_names:
+                self.set_attr(tag, attr["tags"])
+
+
+def get_dataset_list(dataset_names: list[str] | None, dataset_dir: str | dict) -> list["DatasetAttr"]:
+    r"""Get the attributes of the datasets from a dict, a local or `REMOTE:` config file, or hub names (`ONLINE`)."""
+    if dataset_names is None:
+        dataset_names = []
+
+    if isinstance(dataset_dir, dict):  # the dataset config was passed in directly
+        dataset_info = dataset_dir
+    elif dataset_dir == "ONLINE":  # no config file: names are treated as hub dataset ids below
+        dataset_info = None
+    else:
+        if dataset_dir.startswith("REMOTE:"):  # text after the prefix is the hub dataset repo id
+            config_path = hf_hub_download(repo_id=dataset_dir[7:], filename=DATA_CONFIG, repo_type="dataset")
+        else:
+            config_path = os.path.join(dataset_dir, DATA_CONFIG)
+
+        try:
+            with open(config_path, encoding="utf-8") as f:  # JSON is UTF-8; do not depend on the locale encoding
+                dataset_info = json.load(f)
+        except Exception as err:
+            if len(dataset_names) != 0:  # only fatal when datasets were actually requested
+                raise ValueError(f"Cannot open {config_path} due to {str(err)}.") from err
+
+            dataset_info = None
+
+    dataset_list: list[DatasetAttr] = []
+    for name in dataset_names:
+        if dataset_info is None:  # dataset_dir is ONLINE
+            load_from = "ms_hub" if use_modelscope() else "om_hub" if use_openmind() else "hf_hub"
+            dataset_attr = DatasetAttr(load_from, dataset_name=name)
+            dataset_list.append(dataset_attr)
+            continue
+
+        if name not in dataset_info:
+            raise ValueError(f"Undefined dataset {name} in {DATA_CONFIG}.")
+
+        has_hf_url = "hf_hub_url" in dataset_info[name]
+        has_ms_url = "ms_hub_url" in dataset_info[name]
+        has_om_url = "om_hub_url" in dataset_info[name]
+
+        if has_hf_url or has_ms_url or has_om_url:  # hub sources take priority over script / cloud / local file
+            if has_ms_url and (use_modelscope() or not has_hf_url):
+                dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"])
+            elif has_om_url and (use_openmind() or not has_hf_url):
+                dataset_attr = DatasetAttr("om_hub", dataset_name=dataset_info[name]["om_hub_url"])
+            else:
+                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
+        elif "script_url" in dataset_info[name]:
+            dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
+        elif "cloud_file_name" in dataset_info[name]:
+            dataset_attr = DatasetAttr("cloud_file", dataset_name=dataset_info[name]["cloud_file_name"])
+        else:  # fallback: a local file entry; "file_name" must be present
+            dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"])
+
+        dataset_attr.join(dataset_info[name])  # overlay formatting, columns and tags from the config entry
+        dataset_list.append(dataset_attr)
+
+    return dataset_list
diff --git a/LlamaFactory/src/llamafactory/data/processor/__init__.py b/LlamaFactory/src/llamafactory/data/processor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..357ab7899f9eecbd29344482d109b89af274ea2e
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/processor/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .feedback import FeedbackDatasetProcessor
+from .pairwise import PairwiseDatasetProcessor
+from .pretrain import PretrainDatasetProcessor
+from .processor_utils import DatasetProcessor
+from .supervised import PackedSupervisedDatasetProcessor, SupervisedDatasetProcessor
+from .unsupervised import UnsupervisedDatasetProcessor
+
+
+__all__ = [  # processor classes re-exported from this package's submodules (imported above)
+    "DatasetProcessor",
+    "FeedbackDatasetProcessor",
+    "PackedSupervisedDatasetProcessor",
+    "PairwiseDatasetProcessor",
+    "PretrainDatasetProcessor",
+    "SupervisedDatasetProcessor",
+    "UnsupervisedDatasetProcessor",
+]
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b68184f3e51885aec47e0c7607dc9cabd0d7c904
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb5a3473d7073875355f8ff8523434d7c74f40fc
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e8401149ab1bcae6dc386b5ccf65b03ac45e9a2
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/feedback.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/feedback.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e0fedcc03fc5f66fbdeacf9fe775c9c4917b6f3
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/feedback.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca8634f937d84b188244ac9742315258b4977132
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..579e10f4b358f018926b65c94376f2e31ea48fb4
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1871b7b18b751b32ed18bf5f6120af7096439b1
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25caacb507aed2463383dfe95c8adf203ab03d72
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ae010fdeac8161828c27bdfc500f1d0a4a976a0
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..404d7b0d88f68529ba75557a8f5b4c8a0b32ec36
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a5cf53e4b86ee798626b93c207d88f40ada3380
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/supervised.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/supervised.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d78b8332c821e11b63fba89873ee2eec98ca1fa6
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/supervised.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..226cc51e84280fd486808a42b053031bbc0fbef8
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-312.pyc b/LlamaFactory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..374b93dc9bd351a038f45edd06ee34a554270107
Binary files /dev/null and b/LlamaFactory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/data/processor/feedback.py b/LlamaFactory/src/llamafactory/data/processor/feedback.py
new file mode 100644
index 0000000000000000000000000000000000000000..871615b9266e501f25f68e84e4536c0d24617803
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/processor/feedback.py
@@ -0,0 +1,129 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ...extras.constants import IGNORE_INDEX
+from .processor_utils import DatasetProcessor, infer_seqlen
+
+
+if TYPE_CHECKING:
+    from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class FeedbackDatasetProcessor(DatasetProcessor):  # encodes tagged (kto_tag) examples plus a mismatched KL pair
+    def _encode_data_example(
+        self,
+        prompt: list[dict[str, str]],
+        response: list[dict[str, str]],
+        kl_response: list[dict[str, str]],
+        system: Optional[str],
+        tools: Optional[str],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+    ) -> tuple[list[int], list[int], list[int], list[int], bool]:  # (input_ids, labels, kl_input_ids, kl_labels, tag)
+        if response[0]["content"]:  # desired example
+            kto_tag = True
+            messages = prompt + [response[0]]
+        else:  # undesired example
+            kto_tag = False
+            messages = prompt + [response[1]]
+
+        if kl_response[0]["content"]:  # same slot selection for the mismatched (KL) response
+            kl_messages = prompt + [kl_response[0]]
+        else:
+            kl_messages = prompt + [kl_response[1]]
+
+        messages = self.template.mm_plugin.process_messages(messages, images, videos, audios, self.processor)
+        kl_messages = self.template.mm_plugin.process_messages(kl_messages, images, videos, audios, self.processor)
+        prompt_ids, response_ids = self.template.encode_oneturn(self.tokenizer, messages, system, tools)
+        kl_prompt_ids, kl_response_ids = self.template.encode_oneturn(self.tokenizer, kl_messages, system, tools)
+
+        if self.template.efficient_eos:  # in this mode the EOS id is appended to both responses here
+            response_ids += [self.tokenizer.eos_token_id]
+            kl_response_ids += [self.tokenizer.eos_token_id]
+
+        prompt_ids, _ = self.template.mm_plugin.process_token_ids(  # labels passed as None; second result unused
+            prompt_ids, None, images, videos, audios, self.tokenizer, self.processor
+        )
+        kl_prompt_ids, _ = self.template.mm_plugin.process_token_ids(
+            kl_prompt_ids, None, images, videos, audios, self.tokenizer, self.processor
+        )
+
+        source_len, target_len = infer_seqlen(len(prompt_ids), len(response_ids), self.data_args.cutoff_len)
+        prompt_ids = prompt_ids[:source_len]  # truncate prompt and response to the lengths infer_seqlen returned
+        response_ids = response_ids[:target_len]
+        kl_source_len, kl_target_len = infer_seqlen(
+            len(kl_prompt_ids), len(kl_response_ids), self.data_args.cutoff_len
+        )
+        kl_prompt_ids = kl_prompt_ids[:kl_source_len]
+        kl_response_ids = kl_response_ids[:kl_target_len]
+
+        input_ids = prompt_ids + response_ids
+        labels = [IGNORE_INDEX] * source_len + response_ids  # prompt positions carry IGNORE_INDEX
+        kl_input_ids = kl_prompt_ids + kl_response_ids
+        kl_labels = [IGNORE_INDEX] * kl_source_len + kl_response_ids
+        return input_ids, labels, kl_input_ids, kl_labels, kto_tag
+
+    def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        # Creates mismatched pairs of prompts and completions for the KL dataset by adding a +1 offset to the order of completions.
+        kl_response = [examples["_response"][-1]] + examples["_response"][:-1]  # example i gets response i-1
+        model_inputs = defaultdict(list)
+        for i in range(len(examples["_prompt"])):
+            if len(examples["_prompt"][i]) % 2 != 1 or len(examples["_response"][i]) < 2:  # odd prompt, 2 responses
+                logger.warning_rank0(
+                    "Dropped invalid example: {}".format(examples["_prompt"][i] + examples["_response"][i])
+                )
+                continue
+
+            input_ids, labels, kl_input_ids, kl_labels, kto_tag = self._encode_data_example(
+                prompt=examples["_prompt"][i],
+                response=examples["_response"][i],
+                kl_response=kl_response[i],  # NOTE(review): from another example, not length-checked above — confirm
+                system=examples["_system"][i],
+                tools=examples["_tools"][i],
+                images=examples["_images"][i] or [],  # falsy media values become empty lists for encoding
+                videos=examples["_videos"][i] or [],
+                audios=examples["_audios"][i] or [],
+            )
+            model_inputs["input_ids"].append(input_ids)
+            model_inputs["attention_mask"].append([1] * len(input_ids))
+            model_inputs["labels"].append(labels)
+            model_inputs["kl_input_ids"].append(kl_input_ids)
+            model_inputs["kl_attention_mask"].append([1] * len(kl_input_ids))
+            model_inputs["kl_labels"].append(kl_labels)
+            model_inputs["kto_tags"].append(kto_tag)
+            model_inputs["images"].append(examples["_images"][i])  # media stored as given, without the `or []`
+            model_inputs["videos"].append(examples["_videos"][i])
+            model_inputs["audios"].append(examples["_audios"][i])
+
+        desirable_num = sum([1 for tag in model_inputs["kto_tags"] if tag])
+        undesirable_num = len(model_inputs["kto_tags"]) - desirable_num
+        if desirable_num == 0 or undesirable_num == 0:  # warn when this batch has only one kind of tag
+            logger.warning_rank0("Your dataset only has one preference type.")
+
+        return model_inputs
+
+    def print_data_example(self, example: dict[str, list[int]]) -> None:  # prints ids and their decoded text
+        valid_labels = list(filter(lambda x: x != IGNORE_INDEX, example["labels"]))  # drop IGNORE_INDEX first
+        print("input_ids:\n{}".format(example["input_ids"]))
+        print("inputs:\n{}".format(self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
+        print("label_ids:\n{}".format(example["labels"]))
+        print(f"labels:\n{self.tokenizer.decode(valid_labels, skip_special_tokens=False)}")
diff --git a/LlamaFactory/src/llamafactory/data/processor/pairwise.py b/LlamaFactory/src/llamafactory/data/processor/pairwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..94101deb8e75af73c1851720604994a11f2eb87d
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/processor/pairwise.py
@@ -0,0 +1,118 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ...extras.constants import IGNORE_INDEX
+from .processor_utils import DatasetProcessor, infer_seqlen
+
+
+if TYPE_CHECKING:
+    from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class PairwiseDatasetProcessor(DatasetProcessor):
+    r"""Build paired chosen/rejected model inputs from examples that carry two candidate responses."""
+
+    def _encode_data_example(
+        self,
+        prompt: list[dict[str, str]],
+        response: list[dict[str, str]],
+        system: Optional[str],
+        tools: Optional[str],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+    ) -> tuple[list[int], list[int], list[int], list[int]]:
+        r"""Encode one example, treating ``response[0]`` as chosen and ``response[1]`` as rejected.
+
+        Returns ``(chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels)``; the prompt part
+        of both label lists is filled with ``IGNORE_INDEX``.
+        """
+        plugin = self.template.mm_plugin
+        media = (images, videos, audios)
+        chosen_messages = plugin.process_messages(prompt + [response[0]], *media, self.processor)
+        rejected_messages = plugin.process_messages(prompt + [response[1]], *media, self.processor)
+        prompt_ids, chosen_ids = self.template.encode_oneturn(self.tokenizer, chosen_messages, system, tools)
+        _, rejected_ids = self.template.encode_oneturn(self.tokenizer, rejected_messages, system, tools)
+
+        if self.template.efficient_eos:
+            chosen_ids += [self.tokenizer.eos_token_id]
+            rejected_ids += [self.tokenizer.eos_token_id]
+
+        prompt_ids, _ = plugin.process_token_ids(prompt_ids, None, *media, self.tokenizer, self.processor)
+        # consider the response is more important
+        longest_response_len = max(len(chosen_ids), len(rejected_ids))
+        source_len, target_len = infer_seqlen(len(prompt_ids), longest_response_len, self.data_args.cutoff_len)
+        prompt_ids = prompt_ids[:source_len]
+        prompt_labels = [IGNORE_INDEX] * source_len
+        chosen_ids, rejected_ids = chosen_ids[:target_len], rejected_ids[:target_len]
+        chosen_input_ids, rejected_input_ids = prompt_ids + chosen_ids, prompt_ids + rejected_ids
+        return chosen_input_ids, prompt_labels + chosen_ids, rejected_input_ids, prompt_labels + rejected_ids
+
+    def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        r"""Encode a batch of examples into chosen/rejected model inputs.
+
+        Examples whose prompt has an even number of messages or that carry fewer than two responses are
+        dropped with a warning. The raw media entries of the kept examples are passed through unchanged.
+        """
+        # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
+        model_inputs = defaultdict(list)
+        for i, prompt in enumerate(examples["_prompt"]):
+            response = examples["_response"][i]
+            if len(prompt) % 2 != 1 or len(response) < 2:
+                logger.warning_rank0("Dropped invalid example: {}".format(prompt + response))
+                continue
+
+            images, videos, audios = examples["_images"][i], examples["_videos"][i], examples["_audios"][i]
+            chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels = self._encode_data_example(
+                prompt=prompt,
+                response=response,
+                system=examples["_system"][i],
+                tools=examples["_tools"][i],
+                images=images or [],
+                videos=videos or [],
+                audios=audios or [],
+            )
+            model_inputs["chosen_input_ids"].append(chosen_input_ids)
+            model_inputs["chosen_attention_mask"].append([1] * len(chosen_input_ids))
+            model_inputs["chosen_labels"].append(chosen_labels)
+            model_inputs["rejected_input_ids"].append(rejected_input_ids)
+            model_inputs["rejected_attention_mask"].append([1] * len(rejected_input_ids))
+            model_inputs["rejected_labels"].append(rejected_labels)
+            model_inputs["images"].append(images)
+            model_inputs["videos"].append(videos)
+            model_inputs["audios"].append(audios)
+
+        return model_inputs
+
+    def print_data_example(self, example: dict[str, list[int]]) -> None:
+        r"""Print ids and decoded text of the chosen and rejected inputs and of their non-ignored labels."""
+        decode = self.tokenizer.decode
+        valid_chosen_labels = [token_id for token_id in example["chosen_labels"] if token_id != IGNORE_INDEX]
+        valid_rejected_labels = [token_id for token_id in example["rejected_labels"] if token_id != IGNORE_INDEX]
+
+        print("chosen_input_ids:\n{}".format(example["chosen_input_ids"]))
+        print("chosen_inputs:\n{}".format(decode(example["chosen_input_ids"], skip_special_tokens=False)))
+        print("chosen_label_ids:\n{}".format(example["chosen_labels"]))
+        print(f"chosen_labels:\n{decode(valid_chosen_labels, skip_special_tokens=False)}")
+        print("rejected_input_ids:\n{}".format(example["rejected_input_ids"]))
+        print("rejected_inputs:\n{}".format(decode(example["rejected_input_ids"], skip_special_tokens=False)))
+        print("rejected_label_ids:\n{}".format(example["rejected_labels"]))
+        print(f"rejected_labels:\n{decode(valid_rejected_labels, skip_special_tokens=False)}")
diff --git a/LlamaFactory/src/llamafactory/data/processor/pretrain.py b/LlamaFactory/src/llamafactory/data/processor/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fa6b1ca58a8d59493cd4b43c51cb268080cc506
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/processor/pretrain.py
@@ -0,0 +1,57 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from itertools import chain
+from typing import Any
+
+from .processor_utils import DatasetProcessor
+
+
+@dataclass
+class PretrainDatasetProcessor(DatasetProcessor):
+    def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        r"""Tokenize the first message of each prompt, one sequence per text or packed into fixed-size blocks."""
+        # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
+        tokenizer = self.tokenizer
+        cutoff_len = self.data_args.cutoff_len
+        prepend_bos = getattr(tokenizer, "add_bos_token", False)
+        eos_token = "<|end_of_text|>" if self.data_args.template == "llama3" else tokenizer.eos_token
+        text_examples = [messages[0]["content"] + eos_token for messages in examples["_prompt"]]
+
+        if not self.data_args.packing:  # one sequence per text, truncated to cutoff_len
+            if prepend_bos:
+                text_examples = [tokenizer.bos_token + example for example in text_examples]
+
+            return tokenizer(text_examples, add_special_tokens=False, truncation=True, max_length=cutoff_len)
+
+        tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
+        concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
+        total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
+        total_length = (total_length // cutoff_len) * cutoff_len  # the incomplete tail block is dropped
+        block_starts = range(0, total_length, cutoff_len)
+        result = {k: [t[i : i + cutoff_len] for i in block_starts] for k, t in concatenated_examples.items()}
+        if prepend_bos:
+            for block in result["input_ids"]:
+                block[0] = tokenizer.bos_token_id  # overwrites (does not insert before) the first token
+
+        return result
+
+    def print_data_example(self, example: dict[str, list[int]]) -> None:
+        r"""Print the token ids of an example followed by their decoded text."""
+        print("input_ids:\n{}".format(example["input_ids"]))
+        print("inputs:\n{}".format(self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
diff --git a/LlamaFactory/src/llamafactory/data/processor/processor_utils.py b/LlamaFactory/src/llamafactory/data/processor/processor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..db44b19cf6fc84d6551fb7cce82283774ae72030
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/processor/processor_utils.py
@@ -0,0 +1,88 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bisect
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer, ProcessorMixin
+
+    from ...hparams import DataArguments
+    from ..template import Template
+
+
+@dataclass
+class DatasetProcessor(ABC):
+    r"""Abstract base of the dataset processors, which turn batches of examples into model inputs."""
+
+    template: "Template"  # chat template used by the subclasses to encode the messages into token ids
+    tokenizer: "PreTrainedTokenizer"  # tokenizer handed to the template; also decodes ids for printing
+    processor: Optional["ProcessorMixin"]  # optional processor forwarded to the template's mm_plugin
+    data_args: "DataArguments"  # data options read by the subclasses (e.g. cutoff_len, packing)
+
+    @abstractmethod
+    def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        r"""Build model inputs from a column-wise batch of examples (one list per column)."""
+        ...
+
+    @abstractmethod
+    def print_data_example(self, example: dict[str, list[int]]) -> None:
+        r"""Print a single processed example to stdout."""
+        ...
+
+
+def search_for_fit(numbers: list[int], capacity: int) -> int:
+    r"""Return the index of the largest value in ascending ``numbers`` not exceeding ``capacity``, or -1 if none."""
+    insertion_point = bisect.bisect_right(numbers, capacity)  # number of values <= capacity
+    return insertion_point - 1 if insertion_point > 0 else -1
+
+
+def greedy_knapsack(numbers: list[int], capacity: int) -> list[list[int]]:
+    r"""Greedily pack ``numbers`` into knapsacks whose sums never exceed ``capacity``.
+
+    Every knapsack repeatedly takes the largest remaining number that still fits (binary search on a sorted
+    copy, so the caller's list is not modified). Raises ``ValueError`` if a number exceeds ``capacity``.
+    """
+    remaining = sorted(numbers)  # ascending order is required by the binary search
+    if remaining and remaining[-1] > capacity:  # fits nowhere: the packing loop would never terminate
+        raise ValueError(f"Cannot pack the number {remaining[-1]} into knapsacks of capacity {capacity}.")
+
+    knapsacks = []
+    while remaining:
+        current_knapsack, remaining_capacity = [], capacity
+        # index of the largest number that still fits; -1 once no remaining number fits in this knapsack
+        while (index := bisect.bisect_right(remaining, remaining_capacity) - 1) >= 0:
+            remaining_capacity -= remaining[index]  # update the remaining capacity
+            current_knapsack.append(remaining.pop(index))  # add the number to knapsack
+        knapsacks.append(current_knapsack)
+
+    return knapsacks
+
+
+def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> tuple[int, int]:
+    r"""Compute the source and target lengths that remain after truncation to ``cutoff_len`` tokens."""
+    if cutoff_len > 2 * target_len:  # short target: it is kept whole, only the source may shrink
+        target_budget = cutoff_len
+    elif cutoff_len > 2 * source_len:  # short source: the target gets whatever the source leaves
+        target_budget = cutoff_len - source_len
+    else:  # both are long: share the budget proportionally to their lengths
+        target_budget = int(cutoff_len * (target_len / (source_len + target_len)))
+
+    kept_target = min(target_budget, target_len)
+    source_budget = max(cutoff_len - kept_target, 0)
+    kept_source = min(source_budget, source_len)
+    return kept_source, kept_target
diff --git a/LlamaFactory/src/llamafactory/data/processor/supervised.py b/LlamaFactory/src/llamafactory/data/processor/supervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5aba11b6535078f46bdf6aca743c6ae262e1fc6
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/processor/supervised.py
@@ -0,0 +1,203 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ...extras.constants import IGNORE_INDEX
+from .processor_utils import DatasetProcessor, greedy_knapsack, infer_seqlen
+
+
+if TYPE_CHECKING:
+    from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class SupervisedDatasetProcessor(DatasetProcessor):
+    def _encode_data_example(
+        self,
+        prompt: list[dict[str, str]],
+        response: list[dict[str, str]],
+        system: Optional[str],
+        tools: Optional[str],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+    ) -> tuple[list[int], list[int]]:
+        r"""Encode a multi-turn example into ``(input_ids, labels)`` that fit into ``cutoff_len`` tokens.
+
+        Prompt labels are ``IGNORE_INDEX`` unless ``train_on_prompt`` is set. With ``mask_history`` the turns
+        are visited last-to-first and only the last turn of the dialogue keeps its response labels.
+        """
+        plugin, cutoff_len = self.template.mm_plugin, self.data_args.cutoff_len
+        mask_history, efficient_eos = self.data_args.mask_history, self.template.efficient_eos
+        messages = plugin.process_messages(prompt + response, images, videos, audios, self.processor)
+        input_ids, labels = plugin.process_token_ids([], [], images, videos, audios, self.tokenizer, self.processor)
+        encoded_pairs = self.template.encode_multiturn(self.tokenizer, messages, system, tools)
+        total_length = len(input_ids) + (1 if efficient_eos else 0)
+        if mask_history:
+            encoded_pairs = encoded_pairs[::-1]  # high priority for last turns
+
+        for turn_idx, (source_ids, target_ids) in enumerate(encoded_pairs):
+            if total_length >= cutoff_len:
+                break
+
+            source_len, target_len = infer_seqlen(len(source_ids), len(target_ids), cutoff_len - total_length)
+            source_ids, target_ids = source_ids[:source_len], target_ids[:target_len]
+            total_length += source_len + target_len
+
+            if self.data_args.train_on_prompt:
+                source_label = source_ids
+            elif efficient_eos and turn_idx != 0:  # first label is the EOS id, the rest is masked
+                source_label = [self.tokenizer.eos_token_id] + [IGNORE_INDEX] * (source_len - 1)
+            else:
+                source_label = [IGNORE_INDEX] * source_len
+
+            # with mask_history the turn visited first is the last one of the dialogue: train on it only
+            target_label = [IGNORE_INDEX] * target_len if mask_history and turn_idx != 0 else target_ids
+            if mask_history:  # reversed sequences: prepend this turn
+                input_ids = source_ids + target_ids + input_ids
+                labels = source_label + target_label + labels
+            else:
+                input_ids += source_ids + target_ids
+                labels += source_label + target_label
+
+        if efficient_eos:
+            input_ids += [self.tokenizer.eos_token_id]
+            labels += [self.tokenizer.eos_token_id]
+
+        return input_ids, labels
+
+    def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        r"""Encode a batch of examples; those without an odd-length prompt and one response are dropped."""
+        # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
+        # for multiturn examples, we only mask the prompt part in each prompt-response pair.
+        model_inputs = defaultdict(list)
+        for i, prompt in enumerate(examples["_prompt"]):
+            response = examples["_response"][i]
+            if len(prompt) % 2 != 1 or len(response) != 1:
+                logger.warning_rank0("Dropped invalid example: {}".format(prompt + response))
+                continue
+
+            images, videos, audios = examples["_images"][i], examples["_videos"][i], examples["_audios"][i]
+            input_ids, labels = self._encode_data_example(
+                prompt=prompt,
+                response=response,
+                system=examples["_system"][i],
+                tools=examples["_tools"][i],
+                images=images or [],
+                videos=videos or [],
+                audios=audios or [],
+            )
+            model_inputs["input_ids"].append(input_ids)
+            model_inputs["attention_mask"].append([1] * len(input_ids))
+            model_inputs["labels"].append(labels)
+            model_inputs["images"].append(images)
+            model_inputs["videos"].append(videos)
+            model_inputs["audios"].append(audios)
+
+        return model_inputs
+
+    def print_data_example(self, example: dict[str, list[int]]) -> None:
+        r"""Print the raw ids and decoded text of the inputs and of the labels left after dropping IGNORE_INDEX."""
+        ids, label_ids, decode = example["input_ids"], example["labels"], self.tokenizer.decode
+        kept = [label_id for label_id in label_ids if label_id != IGNORE_INDEX]
+        print("input_ids:\n{}".format(ids), "inputs:\n{}".format(decode(ids, skip_special_tokens=False)), sep="\n")
+        print("label_ids:\n{}".format(label_ids), f"labels:\n{decode(kept, skip_special_tokens=False)}", sep="\n")
+
+
+@dataclass
+class PackedSupervisedDatasetProcessor(SupervisedDatasetProcessor):
+    r"""Supervised processor that concatenates several encoded examples into one fixed-length sequence."""
+
+    def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        r"""Encode the examples and pack them into sequences of exactly ``cutoff_len + 1`` tokens.
+
+        Invalid examples and examples longer than ``cutoff_len`` are dropped with a warning. The rest is
+        grouped by ``greedy_knapsack`` on their lengths; every group is concatenated and right-padded with
+        ``pad_token_id``. With ``neat_packing`` the attention mask holds the 1-based index of each example
+        inside its group (0 on padding); otherwise it is all ones.
+        """
+        # TODO: use `position_ids` to achieve packing
+        # build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
+        # and labels with format `<ignore> ... <ignore> Y1 <eos> <ignore> ... <ignore> Y2 <eos>`
+        cutoff_len, neat_packing = self.data_args.cutoff_len, self.data_args.neat_packing
+        encoded = []  # (input_ids, labels, images, videos, audios) of every kept example
+        length2indexes = defaultdict(list)  # encoded length -> positions in `encoded` that are not packed yet
+        for i, prompt in enumerate(examples["_prompt"]):
+            response = examples["_response"][i]
+            if len(prompt) % 2 != 1 or len(response) != 1:
+                logger.warning_rank0("Dropped invalid example: {}".format(prompt + response))
+                continue
+
+            images, videos, audios = examples["_images"][i], examples["_videos"][i], examples["_audios"][i]
+            input_ids, labels = self._encode_data_example(
+                prompt=prompt,
+                response=response,
+                system=examples["_system"][i],
+                tools=examples["_tools"][i],
+                images=images or [],
+                videos=videos or [],
+                audios=audios or [],
+            )
+            length = len(input_ids)
+            if length > self.data_args.cutoff_len:
+                logger.warning_rank0(f"Dropped lengthy example with length {length} > {self.data_args.cutoff_len}.")
+                continue
+
+            length2indexes[length].append(len(encoded))
+            encoded.append((input_ids, labels, images or [], videos or [], audios or []))
+
+        model_inputs = defaultdict(list)
+        lengths = [len(kept_example[0]) for kept_example in encoded]
+        knapsacks = greedy_knapsack(lengths, cutoff_len)
+        # every knapsack is a list of lengths; each length is mapped back to one example of that length
+        for knapsack in knapsacks:
+            packed_input_ids, packed_attention_masks, packed_position_ids, packed_labels = [], [], [], []
+            packed_images, packed_videos, packed_audios = [], [], []
+            for segment_id, length in enumerate(knapsack, start=1):  # start from 1
+                seq_ids, seq_labels, seq_images, seq_videos, seq_audios = encoded[length2indexes[length].pop()]
+                packed_input_ids += seq_ids
+                packed_position_ids += list(range(len(seq_ids)))  # NOTE: pad_to_multiple_of ignore this
+                packed_labels += seq_labels
+                packed_images += seq_images
+                packed_videos += seq_videos
+                packed_audios += seq_audios
+                packed_attention_masks += [segment_id if neat_packing else 1] * len(seq_ids)
+
+            pad_length = cutoff_len + 1 - len(packed_input_ids)
+            if pad_length > 0:  # avoid flash_attn drops attn mask
+                packed_input_ids += [self.tokenizer.pad_token_id] * pad_length
+                packed_position_ids += [0] * pad_length
+                packed_labels += [IGNORE_INDEX] * pad_length
+                # padding is 0 for neat packing and 1 otherwise (more efficient flash_attn)
+                packed_attention_masks += [0 if neat_packing else 1] * pad_length
+
+            if len(packed_input_ids) != cutoff_len + 1:
+                raise ValueError("The length of packed example should be identical to the cutoff length.")
+
+            model_inputs["input_ids"].append(packed_input_ids)
+            model_inputs["attention_mask"].append(packed_attention_masks)
+            model_inputs["position_ids"].append(packed_position_ids)
+            model_inputs["labels"].append(packed_labels)
+            model_inputs["images"].append(packed_images or None)
+            model_inputs["videos"].append(packed_videos or None)
+            model_inputs["audios"].append(packed_audios or None)
+
+        return model_inputs
diff --git a/LlamaFactory/src/llamafactory/data/processor/unsupervised.py b/LlamaFactory/src/llamafactory/data/processor/unsupervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..256174b6dd38696b5b180501102af40ff395d0a9
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/processor/unsupervised.py
@@ -0,0 +1,91 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ..data_utils import Role
+from .processor_utils import DatasetProcessor, infer_seqlen
+
+
+if TYPE_CHECKING:
+    from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class UnsupervisedDatasetProcessor(DatasetProcessor):
+    r"""Build prompt-only model inputs, keeping the encoded response (if any) as separate label ids."""
+
+    def _encode_data_example(
+        self,
+        prompt: list[dict[str, str]],
+        response: list[dict[str, str]],
+        system: Optional[str],
+        tools: Optional[str],
+        images: list["ImageInput"],
+        videos: list["VideoInput"],
+        audios: list["AudioInput"],
+    ) -> tuple[list[int], list[int]]:
+        r"""Encode one example into prompt ``input_ids`` and response ``labels``, both truncated.
+
+        Without exactly one response, an empty assistant message stands in for the response.
+        """
+        plugin, media = self.template.mm_plugin, (images, videos, audios)
+        reply = response if len(response) == 1 else [{"role": Role.ASSISTANT.value, "content": ""}]
+        messages = plugin.process_messages(prompt + reply, *media, self.processor)
+        input_ids, labels = self.template.encode_oneturn(self.tokenizer, messages, system, tools)
+        if self.template.efficient_eos:
+            labels += [self.tokenizer.eos_token_id]
+        input_ids, _ = plugin.process_token_ids(input_ids, None, *media, self.tokenizer, self.processor)
+        source_len, target_len = infer_seqlen(len(input_ids), len(labels), self.data_args.cutoff_len)
+        return input_ids[:source_len], labels[:target_len]
+
+    def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        r"""Encode a batch of examples; those whose prompt has an even number of messages are dropped."""
+        # build inputs with format `<bos> X` and labels with format `Y <eos>`
+        model_inputs = defaultdict(list)
+        for i, prompt in enumerate(examples["_prompt"]):
+            response = examples["_response"][i]
+            if len(prompt) % 2 != 1:
+                logger.warning_rank0("Dropped invalid example: {}".format(prompt + response))
+                continue
+
+            images, videos, audios = examples["_images"][i], examples["_videos"][i], examples["_audios"][i]
+            input_ids, labels = self._encode_data_example(
+                prompt=prompt,
+                response=response,
+                system=examples["_system"][i],
+                tools=examples["_tools"][i],
+                images=images or [],
+                videos=videos or [],
+                audios=audios or [],
+            )
+            model_inputs["input_ids"].append(input_ids)
+            model_inputs["attention_mask"].append([1] * len(input_ids))
+            model_inputs["labels"].append(labels)
+            model_inputs["images"].append(images)
+            model_inputs["videos"].append(videos)
+            model_inputs["audios"].append(audios)
+
+        return model_inputs
+
+    def print_data_example(self, example: dict[str, list[int]]) -> None:
+        r"""Print the ids and the decoded text of both the inputs and the labels of an example."""
+        print("input_ids:\n{}".format(example["input_ids"]))
+        print("inputs:\n{}".format(self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
+        print("label_ids:\n{}".format(example["labels"]))
+        print("labels:\n{}".format(self.tokenizer.decode(example["labels"], skip_special_tokens=False)))
diff --git a/LlamaFactory/src/llamafactory/data/template.py b/LlamaFactory/src/llamafactory/data/template.py
new file mode 100644
index 0000000000000000000000000000000000000000..e83b2e90a60c439370ed5cad4a13846abc977bf5
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/template.py
@@ -0,0 +1,2175 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+from typing_extensions import override
+
+from ..extras import logging
+from .data_utils import Role
+from .formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter
+from .mm_plugin import get_mm_plugin
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
+    from ..hparams import DataArguments
+    from .formatter import SLOTS, Formatter
+    from .mm_plugin import BasePlugin
+    from .tool_utils import FunctionCall
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class Template:
+    r"""Chat template that converts messages to token ids and exports jinja / ollama templates.
+
+    Each `format_*` field holds the formatter used for one kind of message, the other fields
+    configure special tokens, thought markers and tool-call markers.
+    """
+
+    format_user: "Formatter"
+    format_assistant: "Formatter"
+    format_system: "Formatter"
+    format_function: "Formatter"
+    format_observation: "Formatter"
+    format_tools: "Formatter"
+    format_prefix: "Formatter"
+    default_system: str  # used by `_encode` when no system message is given
+    stop_words: list[str]
+    thought_words: tuple[str, str]  # opening and closing markers of a thought
+    tool_call_words: tuple[str, str]
+    efficient_eos: bool
+    replace_eos: bool  # let the first stop word replace the eos token in `fix_special_tokens`
+    replace_jinja_template: bool  # overwrite an existing tokenizer chat template in `fix_jinja_template`
+    enable_thinking: Optional[bool]
+    mm_plugin: "BasePlugin"
+
+    def encode_oneturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> tuple[list[int], list[int]]:
+        r"""Return a single pair of token ids representing prompt and response respectively.
+
+        All encoded messages but the last one are concatenated into the prompt ids, the last
+        encoded message forms the response ids.
+        """
+        encoded_messages = self._encode(tokenizer, messages, system, tools)
+        prompt_ids = []
+        for encoded_ids in encoded_messages[:-1]:
+            prompt_ids += encoded_ids
+
+        response_ids = encoded_messages[-1]
+        return prompt_ids, response_ids
+
+    def encode_multiturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> list[tuple[list[int], list[int]]]:
+        r"""Return multiple pairs of token ids representing prompts and responses respectively.
+
+        Encoded messages are paired as (even index, following odd index), so `messages` is expected
+        to hold an even number of entries.
+        """
+        encoded_messages = self._encode(tokenizer, messages, system, tools)
+        return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)]
+
+    def extract_tool(self, content: str) -> Union[str, list["FunctionCall"]]:
+        r"""Extract tool message."""
+        return self.format_tools.extract(content)
+
+    def get_stop_token_ids(self, tokenizer: "PreTrainedTokenizer") -> list[int]:
+        r"""Return the de-duplicated ids of the eos token and of every stop word."""
+        stop_token_ids = {tokenizer.eos_token_id}
+        for token in self.stop_words:
+            stop_token_ids.add(tokenizer.convert_tokens_to_ids(token))
+
+        return list(stop_token_ids)
+
+    def add_thought(self, content: str = "") -> str:
+        r"""Add empty thought to assistant message."""
+        return f"{self.thought_words[0]}{self.thought_words[1]}" + content
+
+    def remove_thought(self, content: str) -> str:
+        r"""Remove every thought (markers included) from assistant message and strip leading newlines."""
+        pattern = re.compile(f"{re.escape(self.thought_words[0])}(.*?){re.escape(self.thought_words[1])}", re.DOTALL)
+        return re.sub(pattern, "", content).lstrip("\n")
+
+    def get_thought_word_ids(self, tokenizer: "PreTrainedTokenizer") -> list[int]:
+        r"""Get the token ids of thought words."""
+        return tokenizer.encode(self.add_thought(), add_special_tokens=False)
+
+    def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", elements: "SLOTS") -> list[int]:
+        r"""Convert elements to token ids.
+
+        Raises:
+            ValueError: if an element is neither a string, a dict nor a set.
+        """
+        token_ids = []
+        for elem in elements:
+            if isinstance(elem, str):
+                # plain text is tokenized without special tokens, empty strings are skipped
+                if len(elem) != 0:
+                    token_ids += tokenizer.encode(elem, add_special_tokens=False)
+            elif isinstance(elem, dict):
+                # `{"token": ...}` is looked up in the vocabulary instead of being tokenized
+                token_ids += [tokenizer.convert_tokens_to_ids(elem.get("token"))]
+            elif isinstance(elem, set):
+                # `{"bos_token"}` / `{"eos_token"}` are only emitted if the tokenizer defines them
+                if "bos_token" in elem and tokenizer.bos_token_id is not None:
+                    token_ids += [tokenizer.bos_token_id]
+                elif "eos_token" in elem and tokenizer.eos_token_id is not None:
+                    token_ids += [tokenizer.eos_token_id]
+            else:
+                raise ValueError(f"Input must be string, set[str] or dict[str, str], got {type(elem)}")
+
+        return token_ids
+
+    def _encode(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str],
+        tools: Optional[str],
+    ) -> list[list[int]]:
+        r"""Encode formatted inputs to pairs of token ids.
+
+        Turn 0: prefix + system + query        resp
+        Turn t: query                          resp.
+
+        Raises:
+            NotImplementedError: if a message carries an unexpected role.
+        """
+        system = system or self.default_system
+        encoded_messages = []
+        for i, message in enumerate(messages):
+            elements = []
+
+            if i == 0:
+                # prefix, system message and tool description are only placed before the first message
+                elements += self.format_prefix.apply()
+                if system or tools:
+                    tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
+                    elements += self.format_system.apply(content=(system + tool_text))
+
+            if message["role"] == Role.USER:
+                elements += self.format_user.apply(content=message["content"], idx=str(i // 2))
+            elif message["role"] == Role.ASSISTANT:
+                elements += self.format_assistant.apply(content=message["content"])
+            elif message["role"] == Role.OBSERVATION:
+                elements += self.format_observation.apply(content=message["content"])
+            elif message["role"] == Role.FUNCTION:
+                elements += self.format_function.apply(
+                    content=message["content"], thought_words=self.thought_words, tool_call_words=self.tool_call_words
+                )
+            else:
+                raise NotImplementedError("Unexpected role: {}".format(message["role"]))
+
+            encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements))
+
+        return encoded_messages
+
+    @staticmethod
+    def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
+        r"""Add or replace eos token to the tokenizer."""
+        if tokenizer.eos_token == eos_token:
+            return
+
+        # must be read before `add_special_tokens`, which sets the eos token
+        is_added = tokenizer.eos_token_id is None
+        num_added_tokens = tokenizer.add_special_tokens({"eos_token": eos_token})
+
+        if is_added:
+            logger.info_rank0(f"Add eos token: {tokenizer.eos_token}.")
+        else:
+            logger.info_rank0(f"Replace eos token: {tokenizer.eos_token}.")
+
+        if num_added_tokens > 0:
+            logger.warning_rank0("New tokens have been added, make sure `resize_vocab` is True.")
+
+    def fix_special_tokens(self, tokenizer: "PreTrainedTokenizer") -> None:
+        r"""Add eos token and pad token to the tokenizer.
+
+        Raises:
+            ValueError: if `replace_eos` is set but the template has no stop words.
+        """
+        stop_words = self.stop_words
+        if self.replace_eos:
+            if not stop_words:
+                raise ValueError("Stop words are required to replace the EOS token.")
+
+            # the first stop word becomes the eos token, the others stay ordinary stop words
+            self._add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
+            stop_words = stop_words[1:]
+
+        if tokenizer.eos_token_id is None:
+            self._add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
+
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            logger.info_rank0(f"Add pad token: {tokenizer.pad_token}")
+
+        if stop_words:
+            try:
+                num_added_tokens = tokenizer.add_special_tokens(
+                    dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False
+                )
+            except TypeError:
+                # retry without the keyword for tokenizers whose `add_special_tokens` rejects it
+                num_added_tokens = tokenizer.add_special_tokens(dict(additional_special_tokens=stop_words))
+            logger.info_rank0("Add {} to stop words.".format(",".join(stop_words)))
+            if num_added_tokens > 0:
+                logger.warning_rank0("New tokens have been added, make sure `resize_vocab` is True.")
+
+    @staticmethod
+    def _jinja_escape(content: str) -> str:
+        r"""Escape backslashes and single quotes so content fits in a single-quoted jinja string."""
+        # backslashes go first, otherwise the ones inserted for the quotes would be doubled as well
+        return content.replace("\\", "\\\\").replace("'", r"\'")
+
+    @staticmethod
+    def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str:
+        r"""Convert slots to a jinja expression that concatenates string literals and `placeholder`.
+
+        Raises:
+            ValueError: if a slot is a dict, which cannot be expressed in the jinja template.
+        """
+        slot_items = []
+        for slot in slots:
+            if isinstance(slot, str):
+                slot_pieces = slot.split("{{content}}")
+                if slot_pieces[0]:
+                    slot_items.append("'" + Template._jinja_escape(slot_pieces[0]) + "'")
+                if len(slot_pieces) > 1:
+                    slot_items.append(placeholder)
+                    if slot_pieces[1]:
+                        slot_items.append("'" + Template._jinja_escape(slot_pieces[1]) + "'")
+            elif isinstance(slot, set):  # do not use {{ eos_token }} since it may be replaced
+                # the token text is embedded as a literal, so it needs the same escaping as any slot text
+                if "bos_token" in slot and tokenizer.bos_token_id is not None:
+                    slot_items.append("'" + Template._jinja_escape(tokenizer.bos_token) + "'")
+                elif "eos_token" in slot and tokenizer.eos_token_id is not None:
+                    slot_items.append("'" + Template._jinja_escape(tokenizer.eos_token) + "'")
+            elif isinstance(slot, dict):
+                raise ValueError("Dict is not supported.")
+
+        return " + ".join(slot_items)
+
+    def _get_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the jinja template built from the prefix, system, user and assistant formatters."""
+        prefix = self._convert_slots_to_jinja(self.format_prefix.apply(), tokenizer)
+        system = self._convert_slots_to_jinja(self.format_system.apply(), tokenizer, placeholder="system_message")
+        user = self._convert_slots_to_jinja(self.format_user.apply(), tokenizer)
+        assistant = self._convert_slots_to_jinja(self.format_assistant.apply(), tokenizer)
+        jinja_template = ""
+        if prefix:
+            jinja_template += "{{ " + prefix + " }}"
+
+        if self.default_system:
+            jinja_template += "{% set system_message = '" + self._jinja_escape(self.default_system) + "' %}"
+
+        # a leading system message overrides the default one, only user and assistant turns are rendered
+        jinja_template += (
+            "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}"
+            "{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}"
+            "{% if system_message is defined %}{{ " + system + " }}{% endif %}"
+            "{% for message in loop_messages %}"
+            "{% set content = message['content'] %}"
+            "{% if message['role'] == 'user' %}"
+            "{{ " + user + " }}"
+            "{% elif message['role'] == 'assistant' %}"
+            "{{ " + assistant + " }}"
+            "{% endif %}"
+            "{% endfor %}"
+        )
+        return jinja_template
+
+    def fix_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> None:
+        r"""Replace the jinja template in the tokenizer.
+
+        The template is only written if the tokenizer has none or `replace_jinja_template` is set.
+        A `ValueError` raised while building it is logged and the tokenizer is left unchanged.
+        """
+        if tokenizer.chat_template is None or self.replace_jinja_template:
+            try:
+                tokenizer.chat_template = self._get_jinja_template(tokenizer)
+            except ValueError as e:
+                logger.info_rank0(f"Cannot add this chat template to tokenizer: {e}.")
+
+    @staticmethod
+    def _convert_slots_to_ollama(
+        slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content"
+    ) -> str:
+        r"""Convert slots to ollama template.
+
+        Raises:
+            ValueError: if a slot is a dict.
+        """
+        slot_items = []
+        for slot in slots:
+            if isinstance(slot, str):
+                slot_pieces = slot.split("{{content}}")
+                if slot_pieces[0]:
+                    slot_items.append(slot_pieces[0])
+                if len(slot_pieces) > 1:
+                    slot_items.append("{{ " + placeholder + " }}")
+                    if slot_pieces[1]:
+                        slot_items.append(slot_pieces[1])
+            elif isinstance(slot, set):  # do not use {{ eos_token }} since it may be replaced
+                if "bos_token" in slot and tokenizer.bos_token_id is not None:
+                    slot_items.append(tokenizer.bos_token)
+                elif "eos_token" in slot and tokenizer.eos_token_id is not None:
+                    slot_items.append(tokenizer.eos_token)
+            elif isinstance(slot, dict):
+                raise ValueError("Dict is not supported.")
+
+        return "".join(slot_items)
+
+    def _get_ollama_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the ollama template."""
+        prefix = self._convert_slots_to_ollama(self.format_prefix.apply(), tokenizer)
+        system = self._convert_slots_to_ollama(self.format_system.apply(), tokenizer, placeholder=".System")
+        user = self._convert_slots_to_ollama(self.format_user.apply(), tokenizer, placeholder=".Content")
+        assistant = self._convert_slots_to_ollama(self.format_assistant.apply(), tokenizer, placeholder=".Content")
+        # quadruple braces in the f-strings render as the literal `{{` / `}}` of the ollama syntax
+        return (
+            f"{prefix}{{{{ if .System }}}}{system}{{{{ end }}}}"
+            f"""{{{{ range .Messages }}}}{{{{ if eq .Role "user" }}}}{user}"""
+            f"""{{{{ else if eq .Role "assistant" }}}}{assistant}{{{{ end }}}}{{{{ end }}}}"""
+        )
+
+    def get_ollama_modelfile(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the ollama modelfile.
+
+        The file holds the template, the default system message (if any), one `stop` parameter per
+        stop token and a fixed `num_ctx` of 4096.
+
+        TODO: support function calling.
+        """
+        modelfile = "# ollama modelfile auto-generated by llamafactory\n\n"
+        modelfile += f'FROM .\n\nTEMPLATE """{self._get_ollama_template(tokenizer)}"""\n\n'
+
+        if self.default_system:
+            modelfile += f'SYSTEM """{self.default_system}"""\n\n'
+
+        for stop_token_id in self.get_stop_token_ids(tokenizer):
+            modelfile += f'PARAMETER stop "{tokenizer.convert_ids_to_tokens(stop_token_id)}"\n'
+
+        modelfile += "PARAMETER num_ctx 4096\n"
+        return modelfile
+
+
+@dataclass
+class Llama2Template(Template):
+    r"""A template that fuses the system message into the first user message."""
+
+    @override
+    def _encode(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str],
+        tools: Optional[str],
+    ) -> list[list[int]]:
+        r"""Encode formatted inputs to token ids, one list per message.
+
+        The formatted system text is not emitted on its own, it is prepended to the content of the
+        first message if that message is a user message.
+
+        Raises:
+            NotImplementedError: if a message carries an unexpected role.
+        """
+        system = system or self.default_system
+        encoded_messages = []
+        for i, message in enumerate(messages):
+            elements = []
+
+            # reset for every message so that only the first one can carry the system text
+            system_text = ""
+            if i == 0:
+                elements += self.format_prefix.apply()
+                if system or tools:
+                    tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
+                    system_text = self.format_system.apply(content=(system + tool_text))[0]
+
+            if message["role"] == Role.USER:
+                elements += self.format_user.apply(content=system_text + message["content"])
+            elif message["role"] == Role.ASSISTANT:
+                elements += self.format_assistant.apply(content=message["content"])
+            elif message["role"] == Role.OBSERVATION:
+                elements += self.format_observation.apply(content=message["content"])
+            elif message["role"] == Role.FUNCTION:
+                # pass the same markers as the base `Template._encode` does for function calls
+                elements += self.format_function.apply(
+                    content=message["content"], thought_words=self.thought_words, tool_call_words=self.tool_call_words
+                )
+            else:
+                raise NotImplementedError("Unexpected role: {}".format(message["role"]))
+
+            encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements))
+
+        return encoded_messages
+
+    @override
+    def _get_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+        r"""Return the jinja template, with the system message prepended to the first loop message."""
+        prefix = self._convert_slots_to_jinja(self.format_prefix.apply(), tokenizer)
+        system_message = self._convert_slots_to_jinja(
+            self.format_system.apply(), tokenizer, placeholder="system_message"
+        )
+        user_message = self._convert_slots_to_jinja(self.format_user.apply(), tokenizer)
+        assistant_message = self._convert_slots_to_jinja(self.format_assistant.apply(), tokenizer)
+        jinja_template = ""
+        if prefix:
+            jinja_template += "{{ " + prefix + " }}"
+
+        if self.default_system:
+            jinja_template += "{% set system_message = '" + self._jinja_escape(self.default_system) + "' %}"
+
+        # `content` of the first loop message is the formatted system message plus the message content
+        jinja_template += (
+            "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}"
+            "{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}"
+            "{% for message in loop_messages %}"
+            "{% if loop.index0 == 0 and system_message is defined %}"
+            "{% set content = " + system_message + " + message['content'] %}"
+            "{% else %}{% set content = message['content'] %}{% endif %}"
+            "{% if message['role'] == 'user' %}"
+            "{{ " + user_message + " }}"
+            "{% elif message['role'] == 'assistant' %}"
+            "{{ " + assistant_message + " }}"
+            "{% endif %}"
+            "{% endfor %}"
+        )
+        return jinja_template
+
+
+@dataclass
+class ReasoningTemplate(Template):
+    r"""A template that inserts an empty thought when an assistant message carries none."""
+
+    @override
+    def encode_oneturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> tuple[list[int], list[int]]:
+        r"""Return the prompt ids and response ids, with an empty thought added if the response has none."""
+        messages = deepcopy(messages)  # the caller's messages must stay untouched
+        for history_idx in range(1, len(messages) - 2, 2):
+            messages[history_idx]["content"] = self.remove_thought(messages[history_idx]["content"])
+
+        last_message = messages[-1]
+        if self.enable_thinking is False:  # remove all cot
+            last_message["content"] = self.remove_thought(last_message["content"])
+
+        prompt_ids, response_ids = super().encode_oneturn(tokenizer, messages, system, tools)
+
+        response_text = last_message["content"]
+        opening_word = self.thought_words[0].strip()
+        closing_word = self.thought_words[1].strip()
+        if opening_word in response_text or closing_word in response_text:
+            return prompt_ids, response_ids
+
+        # add empty cot
+        thought_ids = self.get_thought_word_ids(tokenizer)
+        if self.enable_thinking:  # do compute loss
+            response_ids = thought_ids + response_ids
+        else:  # do not compute loss
+            prompt_ids += thought_ids
+
+        return prompt_ids, response_ids
+
+    @override
+    def encode_multiturn(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        messages: list[dict[str, str]],
+        system: Optional[str] = None,
+        tools: Optional[str] = None,
+    ) -> list[tuple[list[int], list[int]]]:
+        r"""Return (prompt ids, response ids) pairs, adding an empty thought to every turn that has none."""
+        messages = deepcopy(messages)  # the caller's messages must stay untouched
+        if self.enable_thinking is False:  # remove all cot
+            for response_idx in range(1, len(messages), 2):
+                messages[response_idx]["content"] = self.remove_thought(messages[response_idx]["content"])
+
+        encoded_messages = self._encode(tokenizer, messages, system, tools)
+        opening_word = self.thought_words[0].strip()
+        closing_word = self.thought_words[1].strip()
+        for prompt_idx in range(0, len(messages), 2):
+            response_idx = prompt_idx + 1
+            response_text = messages[response_idx]["content"]
+            if opening_word in response_text or closing_word in response_text:
+                continue
+
+            # add empty cot
+            thought_ids = self.get_thought_word_ids(tokenizer)
+            if self.enable_thinking:  # do compute loss
+                encoded_messages[response_idx] = thought_ids + encoded_messages[response_idx]
+            else:  # do not compute loss
+                encoded_messages[prompt_idx] += thought_ids
+
+        pairs = []
+        for prompt_idx in range(0, len(encoded_messages), 2):
+            pairs.append((encoded_messages[prompt_idx], encoded_messages[prompt_idx + 1]))
+
+        return pairs
+
+
+# Global registry that maps a template name to its `Template` instance, filled by `register_template`.
+TEMPLATES: dict[str, "Template"] = {}
+
+
+def register_template(
+    name: str,
+    format_user: Optional["Formatter"] = None,
+    format_assistant: Optional["Formatter"] = None,
+    format_system: Optional["Formatter"] = None,
+    format_function: Optional["Formatter"] = None,
+    format_observation: Optional["Formatter"] = None,
+    format_tools: Optional["Formatter"] = None,
+    format_prefix: Optional["Formatter"] = None,
+    default_system: str = "",
+    stop_words: Optional[list[str]] = None,
+    thought_words: Optional[tuple[str, str]] = None,
+    tool_call_words: Optional[tuple[str, str]] = None,
+    efficient_eos: bool = False,
+    replace_eos: bool = False,
+    replace_jinja_template: bool = False,
+    enable_thinking: Optional[bool] = True,
+    mm_plugin: "BasePlugin" = get_mm_plugin(name="base"),
+    template_class: type["Template"] = Template,
+) -> None:
+    r"""Build a chat template and store it in `TEMPLATES` under `name`.
+
+    Formatters that are not given fall back to defaults: user and system messages are passed
+    through unchanged, the assistant message is followed by the eos token unless `efficient_eos`
+    is set, the function formatter reuses the slots of `format_assistant` and the observation
+    formatter reuses `format_user`.
+
+    To add the following chat template:
+    ```
+    <s><user>user prompt here
+    <model>model response here</s>
+    <user>user prompt here
+    <model>model response here</s>
+    ```
+
+    The corresponding code should be:
+    ```
+    register_template(
+        name="custom",
+        format_user=StringFormatter(slots=["<user>{{content}}\n<model>"]),
+        format_assistant=StringFormatter(slots=["{{content}}</s>\n"]),
+        format_prefix=EmptyFormatter(slots=["<s>"]),
+    )
+    ```
+
+    Raises:
+        ValueError: if a template called `name` is already registered.
+    """
+    if name in TEMPLATES:
+        raise ValueError(f"Template {name} already exists.")
+
+    if efficient_eos:
+        fallback_slots = ["{{content}}"]
+    else:
+        fallback_slots = ["{{content}}", {"eos_token"}]
+
+    passthrough_formatter = StringFormatter(slots=["{{content}}"])
+    fallback_assistant = StringFormatter(slots=fallback_slots)
+    # function calls are rendered with the assistant slots so that both end a turn the same way
+    function_slots = fallback_slots if format_assistant is None else format_assistant.slots
+    fallback_function = FunctionFormatter(slots=function_slots, tool_format="default")
+    fallback_tools = ToolFormatter(tool_format="default")
+    fallback_prefix = EmptyFormatter()
+
+    fields = dict(
+        format_user=format_user or passthrough_formatter,
+        format_assistant=format_assistant or fallback_assistant,
+        format_system=format_system or passthrough_formatter,
+        format_function=format_function or fallback_function,
+        format_observation=format_observation or format_user or passthrough_formatter,
+        format_tools=format_tools or fallback_tools,
+        format_prefix=format_prefix or fallback_prefix,
+        default_system=default_system,
+        stop_words=stop_words or [],
+        thought_words=thought_words or ("<think>\n", "\n</think>\n\n"),
+        tool_call_words=tool_call_words or ("<tool_call>", "</tool_call>"),
+        efficient_eos=efficient_eos,
+        replace_eos=replace_eos,
+        replace_jinja_template=replace_jinja_template,
+        enable_thinking=enable_thinking,
+        mm_plugin=mm_plugin,
+    )
+    TEMPLATES[name] = template_class(**fields)
+
+
+def parse_template(tokenizer: "PreTrainedTokenizer") -> "Template":
+    r"""Extract a chat template from the tokenizer.
+
+    The tokenizer's own chat template is rendered with `{{content}}` placeholder messages and the
+    rendered strings are cut into the system, user and assistant slots.
+    """
+    prefix = tokenizer.decode(tokenizer.encode(""))
+
+    def skipped_chars(short_str: str, long_str: str) -> str:
+        r"""Return the characters of `long_str` skipped while matching `short_str` from left to right."""
+        extras = []
+        matched = 0
+        for char in long_str:
+            if matched >= len(short_str):
+                break
+
+            if short_str[matched] == char:
+                matched += 1
+            else:
+                extras.append(char)
+
+        return "".join(extras)
+
+    def render(messages: list[dict[str, str]], add_generation_prompt: bool) -> str:
+        r"""Apply the tokenizer's chat template and drop the leading prefix."""
+        text = tokenizer.apply_chat_template(
+            messages, add_generation_prompt=add_generation_prompt, tokenize=False
+        )
+        return text[len(prefix) :]
+
+    system_slot = render([{"role": "system", "content": "{{content}}"}], add_generation_prompt=False)
+    user_slot_empty_system = render(
+        [{"role": "system", "content": ""}, {"role": "user", "content": "{{content}}"}],
+        add_generation_prompt=True,
+    )
+    user_slot = render([{"role": "user", "content": "{{content}}"}], add_generation_prompt=True)
+
+    # what follows the user turn (including the generation prompt) belongs to the assistant
+    assistant_slot = render(
+        [{"role": "user", "content": "{{content}}"}, {"role": "assistant", "content": "{{content}}"}],
+        add_generation_prompt=False,
+    )[len(user_slot) :]
+    template_class = ReasoningTemplate if "<think>" in assistant_slot else Template
+    # remove thought tags
+    assistant_slot = assistant_slot.replace("<think>", "").replace("</think>", "").lstrip("\n")
+
+    # if default_system is empty, user_slot_empty_system will be longer than user_slot
+    default_system = ""
+    if len(user_slot) > len(user_slot_empty_system):
+        default_system = skipped_chars(user_slot_empty_system, user_slot)
+        sole_system = system_slot.replace("{{content}}", default_system, 1)
+        user_slot = user_slot[len(sole_system) :]
+
+    prefix_formatter = EmptyFormatter(slots=[prefix]) if prefix else EmptyFormatter()
+    return template_class(
+        format_user=StringFormatter(slots=[user_slot]),
+        format_assistant=StringFormatter(slots=[assistant_slot]),
+        format_system=StringFormatter(slots=[system_slot]),
+        format_function=FunctionFormatter(slots=[assistant_slot], tool_format="default"),
+        format_observation=StringFormatter(slots=[user_slot]),
+        format_tools=ToolFormatter(tool_format="default"),
+        format_prefix=prefix_formatter,
+        default_system=default_system,
+        stop_words=[],
+        thought_words=("<think>\n", "\n</think>\n\n"),
+        tool_call_words=("<tool_call>", "</tool_call>"),
+        efficient_eos=False,
+        replace_eos=False,
+        replace_jinja_template=False,
+        enable_thinking=True,
+        mm_plugin=get_mm_plugin(name="base"),
+    )
+
+
+def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args: "DataArguments") -> "Template":
+    r"""Get the chat template selected by `data_args` and fix the tokenizer accordingly.
+
+    Without `data_args.template`, the template is parsed from the tokenizer's chat template if it
+    has one, otherwise the `empty` template is used. The overrides in `data_args` (tool format,
+    default system message, thinking mode) are applied to a copy of the registered template, so
+    the entries of `TEMPLATES` are never modified.
+
+    Raises:
+        ValueError: if the requested template is not registered, or if `train_on_prompt` is used
+            with a template that sets `efficient_eos`.
+    """
+    if data_args.template is None:
+        if isinstance(tokenizer.chat_template, str):
+            logger.warning_rank0("`template` was not specified, try parsing the chat template from the tokenizer.")
+            template = parse_template(tokenizer)
+        else:
+            logger.warning_rank0("`template` was not specified, use `empty` template.")
+            template = deepcopy(TEMPLATES["empty"])  # placeholder
+    else:
+        if data_args.template not in TEMPLATES:
+            raise ValueError(f"Template {data_args.template} does not exist.")
+
+        # copy the shared registry entry, it is mutated below and must not leak into later calls
+        template = deepcopy(TEMPLATES[data_args.template])
+
+    if data_args.train_on_prompt and template.efficient_eos:
+        raise ValueError("Current template does not support `train_on_prompt`.")
+
+    if data_args.tool_format is not None:
+        logger.info_rank0(f"Using tool format: {data_args.tool_format}.")
+        default_slots = ["{{content}}"] if template.efficient_eos else ["{{content}}", {"eos_token"}]
+        template.format_function = FunctionFormatter(slots=default_slots, tool_format=data_args.tool_format)
+        template.format_tools = ToolFormatter(tool_format=data_args.tool_format)
+
+    if data_args.default_system is not None:
+        logger.info_rank0(f"Using default system message: {data_args.default_system}.")
+        template.default_system = data_args.default_system
+
+    if isinstance(template, ReasoningTemplate):
+        logger.warning_rank0(
+            "You are using reasoning template, "
+            "please add `_nothink` suffix if the model is not a reasoning model. "
+            "e.g., qwen3_vl_nothink"
+        )
+        template.enable_thinking = data_args.enable_thinking
+
+    template.fix_special_tokens(tokenizer)
+    template.fix_jinja_template(tokenizer)
+    return template
+
+
+register_template(
+    name="alpaca",
+    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n\n"]),
+    default_system=(
+        "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
+    ),
+    replace_jinja_template=True,
+)
+
+
+register_template(
+    name="bailing",
+    format_user=StringFormatter(slots=["<role>HUMAN</role>{{content}}<role>ASSISTANT</role>"]),
+    format_system=StringFormatter(slots=["<role>SYSTEM</role>{{content}}"]),
+    format_observation=StringFormatter(slots=["<role>OBSERVATION</role>{{content}}<role>ASSISTANT</role>"]),
+    stop_words=["<|endoftext|>"],
+    efficient_eos=True,
+)
+
+
+register_template(
+    name="bailing_v2",
+    format_user=StringFormatter(slots=["<role>HUMAN</role>{{content}}<|role_end|><role>ASSISTANT</role>"]),
+    format_system=StringFormatter(slots=["<role>SYSTEM</role>{{content}}<|role_end|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|role_end|>"]),
+    format_observation=StringFormatter(
+        slots=[
+            "<role>OBSERVATION</role>\n<tool_response>\n{{content}}\n</tool_response><|role_end|><role>ASSISTANT</role>"
+        ]
+    ),
+    format_function=FunctionFormatter(slots=["{{content}}<|role_end|>"], tool_format="ling"),
+    format_tools=ToolFormatter(tool_format="ling"),
+    stop_words=["<|endoftext|>"],
+    efficient_eos=True,
+)
+
+
+register_template(
+    name="breeze",
+    format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    efficient_eos=True,
+)
+
+
+register_template(
+    name="chatglm3",
+    format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
+    format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
+    format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(
+        slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
+    ),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+)
+
+
+register_template(
+    name="chatml",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    stop_words=["<|im_end|>", "<|im_start|>"],
+    replace_eos=True,
+    replace_jinja_template=True,
+)
+
+
+# copied from chatml template; the only difference is the German default system prompt
+register_template(
+    name="chatml_de",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
+    stop_words=["<|im_end|>", "<|im_start|>"],
+    replace_eos=True,
+    replace_jinja_template=True,
+)
+
+
+register_template(  # "cohere": turns are delimited by <|START_OF_TURN_TOKEN|> / <|END_OF_TURN_TOKEN|>
+    name="cohere",
+    format_user=StringFormatter(
+        slots=[
+            (
+                "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+            )
+        ]
+    ),
+    format_system=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+# copied from chatml template; adds a bos_token prefix, stops only on <|im_end|>, sets no replace_* flags
+register_template(
+    name="cpm4",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|im_end|>"],
+)
+
+
+# copied from chatml template; adds the DBRX default system prompt and stops only on <|im_end|>
+register_template(
+    name="dbrx",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    default_system=(  # adjacent literals are concatenated into one prompt string
+        "You are DBRX, created by Databricks. You were last updated in December 2023. "
+        "You answer questions based on information available up to that point.\n"
+        "YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough "
+        "responses to more complex and open-ended questions.\nYou assist with various tasks, "
+        "from writing to coding (using markdown for code blocks — remember to use ``` with "
+        "code, JSON, and tables).\n(You do not have real-time data access or code execution "
+        "capabilities. You avoid stereotyping and provide balanced perspectives on "
+        "controversial topics. You do not provide song lyrics, poems, or news articles and "
+        "do not divulge details of your training data.)\nThis is your system prompt, "
+        "guiding your responses. Do not reference it, just respond to the user. If you find "
+        "yourself talking about this message, stop. You should be responding appropriately "
+        "and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION "
+        "ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY."
+    ),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+register_template(  # "deepseek": plain-text "User: ... Assistant:" turns with a bos_token prefix
+    name="deepseek",
+    format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+register_template(  # "deepseek3": role markers use fullwidth bars (U+FF5C), not ASCII "|"
+    name="deepseek3",
+    format_user=StringFormatter(slots=["<｜User｜>{{content}}<｜Assistant｜>"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+# copied from deepseek3 template; the only difference is template_class=ReasoningTemplate
+register_template(
+    name="deepseekr1",
+    format_user=StringFormatter(slots=["<｜User｜>{{content}}<｜Assistant｜>"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(  # "deepseekcoder": "### Instruction:" / "### Response:" layout; answers end with <|EOT|>
+    name="deepseekcoder",
+    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}\n<|EOT|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    default_system=(  # note: the prompt string ends with a newline
+        "You are an AI programming assistant, utilizing the DeepSeek Coder model, "
+        "developed by DeepSeek Company, and you only answer questions related to computer science. "
+        "For politically sensitive questions, security and privacy issues, "
+        "and other non-computer science questions, you will refuse to answer.\n"
+    ),
+)
+
+
+register_template(  # "default": "Human:" / "Assistant:" / "System:" turns, each followed by the eos_token slot
+    name="default",
+    format_user=StringFormatter(slots=["Human: {{content}}", {"eos_token"}, "\nAssistant:"]),
+    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
+    format_system=StringFormatter(slots=["System: {{content}}", {"eos_token"}, "\n"]),
+    replace_jinja_template=True,
+)
+
+
+register_template(  # "dots_ocr": <|user|> / <|assistant|> / <|system|> turns with explicit <|endof...|> closers
+    name="dots_ocr",
+    format_user=StringFormatter(slots=["<|user|>{{content}}<|endofuser|><|assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|endofassistant|>"]),
+    format_system=StringFormatter(slots=["<|system|>{{content}}<|endofsystem|>\n"]),
+    stop_words=["<|endofassistant|>"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(  # uses the "qwen2_vl" plugin with this model's own media tokens
+        name="qwen2_vl",
+        image_token="<|imgpad|>",
+        video_token="<|vidpad|>",
+        vision_bos_token="<|img|>",
+        vision_eos_token="<|endofimg|>",
+    ),
+)
+
+
+register_template(  # "empty": assistant content is passed through as-is; no other formatter is set here
+    name="empty",
+    format_assistant=StringFormatter(slots=["{{content}}"]),
+)
+
+
+# copied from chatml template; turns end with a blank line and the default system prompt sets think_mode=True
+register_template(
+    name="ernie",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n\n<|im_start|>assistant\n"]),
+    default_system="<global_setting>\nthink_mode=True\n</global_setting>",
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(  # "ernie_nothink": plain "User:" / "Assistant:" turns; answers end with <|end_of_sentence|>
+    name="ernie_nothink",
+    format_user=StringFormatter(slots=["User: {{content}}\nAssistant: "]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_sentence|>"]),
+    format_system=StringFormatter(slots=["{{content}}\n"]),
+    format_prefix=EmptyFormatter(slots=["<|begin_of_sentence|>"]),
+    stop_words=["<|end_of_sentence|>"],
+)
+
+
+register_template(  # "ernie_vl": "User:" / "Assistant:" text turns with the "ernie_vl" multimodal plugin
+    name="ernie_vl",
+    format_user=StringFormatter(slots=["User: {{content}}"]),  # no assistant cue here; it is in format_assistant
+    format_assistant=StringFormatter(slots=["\nAssistant: {{content}}<|end_of_sentence|>"]),
+    format_system=StringFormatter(slots=["{{content}}\n"]),
+    stop_words=["<|end_of_sentence|>"],
+    replace_eos=True,
+    replace_jinja_template=True,
+    template_class=ReasoningTemplate,
+    mm_plugin=get_mm_plugin(name="ernie_vl", image_token="<|IMAGE_PLACEHOLDER|>", video_token="<|VIDEO_PLACEHOLDER|>"),
+)
+
+
+register_template(  # "exaone": [|user|] / [|assistant|] / [|system|] role tags
+    name="exaone",
+    format_user=StringFormatter(slots=["[|user|]{{content}}\n[|assistant|]"]),
+    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
+    format_system=StringFormatter(slots=["[|system|]{{content}}[|endofturn|]\n"]),  # only system uses [|endofturn|]
+)
+
+
+register_template(  # "falcon": "User: ..." prompt answered after a "Falcon:" cue
+    name="falcon",
+    format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
+    format_assistant=StringFormatter(slots=["{{content}}\n"]),
+    efficient_eos=True,
+)
+
+
+# copied from chatml template; adds a bos_token prefix and stops on <|im_end|> or <|end_of_text|>
+register_template(
+    name="falcon_h1",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|im_end|>", "<|end_of_text|>"],
+)
+
+
+register_template(  # "fewshot": only the assistant formatter is set; each answer is followed by a blank line
+    name="fewshot",
+    format_assistant=StringFormatter(slots=["{{content}}\n\n"]),
+    efficient_eos=True,
+    replace_jinja_template=True,
+)
+
+
+register_template(  # "gemma": <start_of_turn>{role} ... <end_of_turn> turns; the assistant role is named "model"
+    name="gemma",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),  # system text carries no role marker of its own
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    template_class=Llama2Template,
+)
+
+
+# copied from gemma template; also stops on <eos> and sets efficient_eos instead of replace_eos
+register_template(
+    name="gemma2",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<eos>", "<end_of_turn>"],
+    efficient_eos=True,
+    template_class=Llama2Template,
+)
+
+
+# copied from gemma template; the only difference is the "gemma3" multimodal plugin
+register_template(
+    name="gemma3",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin("gemma3", image_token="<image_soft_token>"),
+    template_class=Llama2Template,
+)
+
+
+register_template(  # "gemma3n": same formatters as "gemma3"; the "gemma3n" plugin also takes an audio token
+    name="gemma3n",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin("gemma3n", image_token="<image_soft_token>", audio_token="<audio_soft_token>"),
+    template_class=Llama2Template,
+)
+
+
+register_template(  # "glm4": <|user|> / <|assistant|> / <|system|> / <|observation|> markers as inline text
+    name="glm4",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),  # literal prefix string, not a token slot
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+)
+
+
+# copied from glm4 template; switches to the "glm4_moe" tool format and ReasoningTemplate
+register_template(
+    name="glm4_moe",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4_moe"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4_moe"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template; adds the "</answer>" stop word, the "glm4v" plugin and ReasoningTemplate
+register_template(
+    name="glm4v",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>", "</answer>"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(name="glm4v", image_token="<|image|>", video_token="<|video|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template; "glm4_moe" tool format, "</answer>" stop word, "glm4v" plugin, ReasoningTemplate
+register_template(
+    name="glm4_5v",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4_moe"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4_moe"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>", "</answer>"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(name="glm4v", image_token="<|image|>", video_token="<|video|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template; the only difference is template_class=ReasoningTemplate
+register_template(
+    name="glmz1",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(  # "gpt_oss": <|start|>{role}<|message|> ... <|end|> turns
+    name="gpt_oss",
+    format_user=StringFormatter(slots=["<|start|>user<|message|>{{content}}<|end|><|start|>assistant"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>"]),
+    format_system=StringFormatter(slots=["<|start|>system<|message|>{{content}}<|end|>"]),
+    default_system="You are ChatGPT, a large language model trained by OpenAI.",
+    # pair below: the "analysis" channel opener, then the closer that also opens the "final" channel
+    thought_words=("<|channel|>analysis<|message|>", "<|end|><|start|>assistant<|channel|>final<|message|>"),
+    efficient_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(  # "granite3": <|start_of_role|>{role}<|end_of_role|> ... <|end_of_text|> turns, no tool formatters
+    name="granite3",
+    format_user=StringFormatter(
+        slots=[
+            "<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+    format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
+)
+
+
+register_template(  # "granite3_vision": <|user|> / <|assistant|> / <|system|> markers with the "llava_next" plugin
+    name="granite3_vision",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}\n<|assistant|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}\n"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+register_template(
+    name="granite4",
+    format_user=StringFormatter(
+        slots=[
+            "<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+    format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|end_of_text|>\n"], tool_format="default"),
+    format_observation=StringFormatter(
+        slots=["<|start_of_role|>tool<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="default"),
+    stop_words=["<|end_of_text|>"],
+    default_system="You are Granite, developed by IBM. You are a helpful AI assistant.",
+)
+
+
+register_template(  # "index": user content sits between the literal strings "reserved_0" and "reserved_1"
+    name="index",
+    format_user=StringFormatter(slots=["reserved_0{{content}}reserved_1"]),
+    format_system=StringFormatter(slots=["<unk>{{content}}"]),
+    efficient_eos=True,
+)
+
+
+register_template(  # "hunyuan": turns end with <|extra_0|> (user), <|eos|> (assistant), <|extra_4|> (system)
+    name="hunyuan",
+    format_user=StringFormatter(slots=["{{content}}<|extra_0|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|eos|>"]),
+    format_system=StringFormatter(slots=["{{content}}<|extra_4|>"]),
+    format_prefix=EmptyFormatter(slots=["<|startoftext|>"]),
+    stop_words=["<|eos|>"],
+)
+
+
+register_template(  # "hunyuan_small": markers contain fullwidth bars (U+FF5C) and U+2581; keep them verbatim
+    name="hunyuan_small",
+    format_user=StringFormatter(slots=["<｜hy_User｜>{{content}}<｜hy_place▁holder▁no▁8｜>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<｜hy_place▁holder▁no▁2｜>"]),
+    format_system=StringFormatter(slots=["{{content}}<｜hy_place▁holder▁no▁3｜>"]),
+    format_prefix=EmptyFormatter(slots=["<｜hy_begin▁of▁sentence｜>"]),
+    stop_words=["<｜hy_place▁holder▁no▁2｜>"],  # same marker that closes assistant turns
+)
+
+
+register_template(  # "intern2": <|im_start|>/<|im_end|> turns, a bos_token prefix and the InternLM system prompt
+    name="intern2",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    default_system=(
+        "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory "
+        "(上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+        "- InternLM (书生·浦语) can understand and communicate fluently in the language "
+        "chosen by the user such as English and 中文."
+    ),
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(  # "intern_vl": <|im_start|>/<|im_end|> turns, bos_token prefix, "intern_vl" multimodal plugin
+    name="intern_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    default_system=(  # Chinese-language system prompt; runtime text, keep verbatim
+        "你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"
+    ),
+    stop_words=["<|im_end|>"],
+    mm_plugin=get_mm_plugin(name="intern_vl", image_token="<image>", video_token="<video>"),
+)
+
+
+register_template(  # "intern_s1": same formatters and plugin as "intern_vl", without a default system prompt
+    name="intern_s1",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|im_end|>"],
+    mm_plugin=get_mm_plugin(name="intern_vl", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from qwen template; this variant sets the "qwen2_vl" plugin and ReasoningTemplate
+register_template(
+    name="keye_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool output is wrapped in <tool_response> inside a "user" turn
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(  # "kimi_vl": <|im_user|> / <|im_assistant|> / <|im_system|> turns split by <|im_middle|>
+    name="kimi_vl",
+    format_user=StringFormatter(
+        slots=["<|im_user|>user<|im_middle|>{{content}}<|im_end|><|im_assistant|>assistant<|im_middle|>"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>"]),
+    format_system=StringFormatter(slots=["<|im_system|>system<|im_middle|>{{content}}<|im_end|>"]),
+    default_system="You are a helpful assistant",
+    stop_words=["<|im_end|>"],
+    thought_words=("◁think▷", "◁/think▷"),  # non-ASCII triangle delimiters; keep verbatim
+    mm_plugin=get_mm_plugin("kimi_vl", image_token="<|media_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(  # "lfm2": <|im_start|>/<|im_end|> turns; tool calls and results have dedicated start/end markers
+    name="lfm2",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2"),
+    format_observation=StringFormatter(
+        slots=[  # a single slot: the two adjacent literals below are concatenated
+            "<|im_start|>tool\n<|tool_response_start|>{{content}}<|tool_response_end|><|im_end|>\n"
+            "<|im_start|>assistant\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="lfm2"),
+    default_system="You are a helpful AI assistant.",
+    stop_words=["<|im_end|>"],
+    tool_call_words=("<|tool_call_start|>", "<|tool_call_end|>"),
+    replace_eos=True,
+)
+
+
+register_template(  # "lfm2_vl": same formatters as "lfm2"; differs in default_system and adds the "lfm2_vl" plugin
+    name="lfm2_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2"),
+    format_observation=StringFormatter(
+        slots=[  # a single slot: the two adjacent literals below are concatenated
+            "<|im_start|>tool\n<|tool_response_start|>{{content}}<|tool_response_end|><|im_end|>\n"
+            "<|im_start|>assistant\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="lfm2"),
+    default_system="You are a helpful multimodal assistant by Liquid AI.",
+    stop_words=["<|im_end|>"],
+    tool_call_words=("<|tool_call_start|>", "<|tool_call_end|>"),
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="lfm2_vl", image_token="<image>"),
+)
+
+
+register_template(  # "llama2": "[INST] ... [/INST]" user turns, each preceded by the bos_token slot
+    name="llama2",
+    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
+    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
+    template_class=Llama2Template,
+)
+
+
+# copied from llama2 template; the only difference is the bilingual default system prompt
+register_template(
+    name="llama2_zh",
+    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
+    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
+    default_system="You are a helpful assistant. 你是一个乐于助人的助手。",
+    template_class=Llama2Template,
+)
+
+
+register_template(  # "llama3": <|start_header_id|>{role}<|end_header_id|> headers; turns end with <|eot_id|>
+    name="llama3",
+    format_user=StringFormatter(
+        slots=[
+            (
+                "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot_id|>"]),
+    format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3"),
+    format_observation=StringFormatter(  # tool results are sent under the "ipython" role
+        slots=[
+            (
+                "<|start_header_id|>ipython<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>", "<|eom_id|>"],
+    replace_eos=True,
+)
+
+
+register_template(  # "llama4": <|header_start|>{role}<|header_end|> headers; turns end with <|eot|>
+    name="llama4",
+    format_user=StringFormatter(
+        slots=["<|header_start|>user<|header_end|>\n\n{{content}}<|eot|><|header_start|>assistant<|header_end|>\n\n"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot|>"]),
+    format_system=StringFormatter(slots=["<|header_start|>system<|header_end|>\n\n{{content}}<|eot|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot|>"], tool_format="llama3"),  # "llama3" tool format
+    format_observation=StringFormatter(  # tool results are sent under the "ipython" role
+        slots=[
+            "<|header_start|>ipython<|header_end|>\n\n{{content}}<|eot|><|header_start|>assistant<|header_end|>\n\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot|>", "<|eom|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="llama4", image_token="<|image|>"),
+)
+
+
+# copied from llama3 template; the only difference is the "mllama" multimodal plugin
+register_template(
+    name="mllama",
+    format_user=StringFormatter(
+        slots=[
+            (
+                "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot_id|>"]),
+    format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3"),
+    format_observation=StringFormatter(
+        slots=[
+            (
+                "<|start_header_id|>ipython<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>", "<|eom_id|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="mllama", image_token="<|image|>"),
+)
+
+
+register_template(  # "moonlight": same turn markers as "kimi_vl"; no thought_words, mm_plugin or template_class
+    name="moonlight",
+    format_user=StringFormatter(
+        slots=["<|im_user|>user<|im_middle|>{{content}}<|im_end|><|im_assistant|>assistant<|im_middle|>"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>"]),
+    format_system=StringFormatter(slots=["<|im_system|>system<|im_middle|>{{content}}<|im_end|>"]),
+    default_system="You are a helpful assistant provided by Moonshot-AI.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+# copied from vicuna template; this variant sets the "llava" multimodal plugin
+register_template(
+    name="llava",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava", image_token="<image>"),
+)
+
+
+# copied from vicuna template; this variant sets the "llava_next" multimodal plugin
+register_template(
+    name="llava_next",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from llama3 template; the only difference is the "llava_next" multimodal plugin
+register_template(
+    name="llava_next_llama3",
+    format_user=StringFormatter(
+        slots=[
+            (
+                "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|eot_id|>"]),
+    format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3"),
+    format_observation=StringFormatter(
+        slots=[
+            (
+                "<|start_header_id|>ipython<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="llama3"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>", "<|eom_id|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from mistral template; this variant sets the "llava_next" multimodal plugin
+register_template(
+    name="llava_next_mistral",
+    format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
+    format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS] {"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+    template_class=Llama2Template,
+)
+
+
+# copied from qwen template; this variant sets the "llava_next" multimodal plugin
+register_template(
+    name="llava_next_qwen",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(  # tool output is wrapped in <tool_response> inside a "user" turn
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from chatml template; no observation formatter, stops only on <|im_end|>, adds the "llava_next" plugin
+register_template(
+    name="llava_next_yi",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+# copied from vicuna template; this variant sets the "llava_next_video" plugin (image and video tokens)
+register_template(
+    name="llava_next_video",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from mistral template; this variant sets the "llava_next_video" plugin (image and video tokens)
+register_template(
+    name="llava_next_video_mistral",
+    format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
+    format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS] {"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>"),
+    template_class=Llama2Template,
+)
+
+
+# copied from chatml template (same slots as "llava_next_yi"), with the llava_next_video plugin (<image>/<video>)
+register_template(
+    name="llava_next_video_yi",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from qwen template; differs only in the generic system prompt and template_class=ReasoningTemplate
+register_template(
+    name="mimo",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen template; MiMo system prompt, explicit <think>/</think> thought_words, ReasoningTemplate
+register_template(
+    name="mimo_v2",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are MiMo, a helpful AI assistant engineered by Xiaomi.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    thought_words=("<think>", "</think>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen2_vl template; the MiMo system prompt and template_class=ReasoningTemplate are the only differences
+register_template(
+    name="mimo_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are MiMo, an AI assistant developed by Xiaomi.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from chatml template; generic system prompt, minicpm_v plugin with <image>/<video> placeholders
+register_template(
+    name="minicpm_v",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    default_system="You are a helpful assistant.",
+    mm_plugin=get_mm_plugin(name="minicpm_v", image_token="<image>", video_token="<video>"),
+)
+
+
+# copied from minicpm_v template; audio-aware system prompt and an extra <audio> token on the same minicpm_v plugin
+register_template(
+    name="minicpm_o",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    default_system="You are a helpful assistant. You can accept audio and text input and output voice and text.",
+    mm_plugin=get_mm_plugin(name="minicpm_v", image_token="<image>", video_token="<video>", audio_token="<audio>"),
+)
+
+
+register_template(
+    name="minimax1",  # each turn: <beginning_of_sentence>{role header}\n{content}<end_of_sentence>\n
+    format_user=StringFormatter(
+        slots=[
+            "<beginning_of_sentence>user name=user\n{{content}}<end_of_sentence>\n<beginning_of_sentence>ai name=assistant\n"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_sentence>\n"]),
+    format_system=StringFormatter(
+        slots=["<beginning_of_sentence>system ai_setting=assistant\n{{content}}<end_of_sentence>\n"]
+    ),
+    format_function=FunctionFormatter(slots=["{{content}}<end_of_sentence>\n"], tool_format="minimax1"),
+    format_observation=StringFormatter(
+        slots=[
+            "<beginning_of_sentence>tool name=tools\n{{content}}<end_of_sentence>\n<beginning_of_sentence>ai name=assistant\n"
+        ]
+    ),
+    format_tools=ToolFormatter(tool_format="minimax1"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<end_of_sentence>"],
+)
+
+
+register_template(
+    name="minimax2",  # turns are "]~b]{role}\n ... [e~[\n"; the system turn is additionally led by "]~!b["
+    format_user=StringFormatter(slots=["]~b]user\n{{content}}[e~[\n]~b]ai\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}[e~[\n"]),
+    format_system=StringFormatter(slots=["]~!b[]~b]system\n{{content}}[e~[\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}[e~[\n"], tool_format="minimax2"),
+    format_observation=StringFormatter(slots=["]~b]tool\n<response>{{content}}</response>[e~[\n]~b]ai\n"]),
+    format_tools=ToolFormatter(tool_format="minimax2"),
+    default_system="You are a helpful assistant. Your name is MiniMax-M2.1 and is built by MiniMax.",
+    stop_words=["[e~["],
+    template_class=ReasoningTemplate,
+)
+
+
+# mistral tokenizer v3 tekken: like "mistral" below, but no space follows the tags and no format_assistant is passed
+register_template(
+    name="ministral",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,
+)
+
+
+# mistral tokenizer v3: a space follows [INST]/[TOOL_CALLS]/[TOOL_RESULTS] and precedes the assistant reply
+register_template(
+    name="mistral",
+    format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
+    format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS] {"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,
+)
+
+
+# mistral tokenizer v7 tekken (adapted from ministral): [SYSTEM_PROMPT] tags, pixtral plugin, no Llama2Template
+register_template(
+    name="mistral_small",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["[SYSTEM_PROMPT]{{content}}[/SYSTEM_PROMPT]"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
+)
+
+
+register_template(
+    name="ministral3",  # same formatters as "ministral", plus the pixtral image plugin ([IMG])
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
+)
+
+
+register_template(
+    name="olmo",  # <|user|>/<|assistant|> headers; note the prefix slot is eos_token, not bos_token
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"eos_token"}]),
+)
+
+
+register_template(
+    name="openchat",  # "GPT4 Correct User/Assistant" roles; the user turn is closed by the eos_token slot
+    format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+register_template(
+    name="openchat-3.6",  # same role names as "openchat", wrapped in <|start_header_id|>...<|eot_id|> markers
+    format_user=StringFormatter(
+        slots=[
+            (
+                "<|start_header_id|>GPT4 Correct User<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|eot_id|>"],
+)
+
+
+# copied from chatml template; observations use a "tool" role turn, default system prompt names OpenCoder
+register_template(
+    name="opencoder",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    default_system="You are OpenCoder, created by OpenCoder Team.",
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(
+    name="paligemma",  # bare prompt: bos_token prefix, then the user content followed by a newline
+    format_user=StringFormatter(slots=["{{content}}\n"]),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="paligemma", image_token="<image>"),
+    template_class=Llama2Template,
+)
+
+
+# copied from gemma template (<start_of_turn>/<end_of_turn> turns), with the paligemma image plugin attached
+register_template(
+    name="paligemma_chat",
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<end_of_turn>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="paligemma", image_token="<image>"),
+    template_class=Llama2Template,
+)
+
+
+register_template(
+    name="phi",  # <|user|>/<|assistant|>/<|system|> headers on their own line; every turn ends with <|end|>
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
+    stop_words=["<|end|>"],
+    replace_eos=True,
+)
+
+
+register_template(
+    name="phi_small",  # "phi" slots + prefix; NOTE(review): prefix set holds a raw token, not "bos_token" — verify
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
+    format_prefix=EmptyFormatter(slots=[{"<|endoftext|>"}]),
+    stop_words=["<|end|>"],
+    replace_eos=True,
+)
+
+
+register_template(
+    name="phi4",  # <|im_start|>{role}<|im_sep|>{content}<|im_end|> with no newlines between turns
+    format_user=StringFormatter(
+        slots=["<|im_start|>user<|im_sep|>{{content}}<|im_end|><|im_start|>assistant<|im_sep|>"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>"]),
+    format_system=StringFormatter(slots=["<|im_start|>system<|im_sep|>{{content}}<|im_end|>"]),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+register_template(
+    name="phi4_mini",  # <|role|>{content}<|end|> with no newlines; tools are wrapped in <|tool|>...<|/tool|>
+    format_user=StringFormatter(slots=["<|user|>{{content}}<|end|><|assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>"]),
+    format_system=StringFormatter(slots=["<|system|>{{content}}<|end|>"]),
+    format_tools=StringFormatter(slots=["<|tool|>{{content}}<|/tool|>"]),
+    stop_words=["<|end|>"],
+    replace_eos=True,
+)
+
+
+# copied from ministral template: identical formatters, with the pixtral image plugin ([IMG]) attached
+register_template(
+    name="pixtral",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
+    template_class=Llama2Template,
+)
+
+
+# copied from chatml template; uses qwen-format tool calls, <tool_response> observations, Qwen system prompt
+register_template(
+    name="qwen",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+# copied from qwen template; no default system prompt, and template_class=ReasoningTemplate
+register_template(
+    name="qwen3",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen template
+register_template(
+    name="qwen3_nothink",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>", "<think>", "</think>"],
+    replace_eos=True,
+)
+
+
+# copied from chatml template; generic system prompt, qwen2_audio plugin with <|AUDIO|> as audio token
+register_template(
+    name="qwen2_audio",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_audio", audio_token="<|AUDIO|>"),
+)
+
+
+# copied from qwen template; generic system prompt, qwen2_omni plugin with media tokens and vision/audio bos/eos
+register_template(
+    name="qwen2_omni",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
+    ),
+)
+
+
+register_template(
+    name="qwen3_omni",  # "qwen3" slots; reuses the qwen2_omni plugin with *_pad media tokens
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
+    ),
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(
+    name="qwen3_omni_nothink",  # identical to "qwen3_omni" except that no template_class is passed
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
+    ),
+)
+
+
+# copied from qwen template; generic system prompt, qwen2_vl plugin with <|image_pad|>/<|video_pad|> tokens
+register_template(
+    name="qwen2_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+)
+
+
+# copied from qwen template; no default system prompt, qwen3_vl plugin, template_class=ReasoningTemplate
+register_template(
+    name="qwen3_vl",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen3_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+    template_class=ReasoningTemplate,
+)
+
+
+# copied from qwen template; identical to "qwen3_vl" except that no template_class is passed
+register_template(
+    name="qwen3_vl_nothink",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>user\n<tool_response>\n{{content}}\n</tool_response><|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_tools=ToolFormatter(tool_format="qwen"),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="qwen3_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+)
+
+
+register_template(
+    name="sailor",  # <|im_start|>/<|im_end|> turns whose user/assistant roles are named "question"/"answer"
+    format_user=StringFormatter(slots=["<|im_start|>question\n{{content}}<|im_end|>\n<|im_start|>answer\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    default_system=(
+        "You are an AI assistant named Sailor created by Sea AI Lab. "
+        "Your answer should be friendly, unbiased, faithful, informative and detailed."
+    ),
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(
+    name="seed_coder",  # each turn is bos_token + "{role}\n{content}" + eos_token; the reply is opened with bos_token
+    format_user=StringFormatter(
+        slots=[{"bos_token"}, "user\n{{content}}", {"eos_token"}, {"bos_token"}, "assistant\n"]
+    ),
+    format_system=StringFormatter(slots=[{"bos_token"}, "system\n{{content}}", {"eos_token"}]),
+    default_system=(
+        "You are an AI programming assistant, utilizing the Seed-Coder model, developed by ByteDance Seed, "
+        "and you only answer questions related to computer science. For politically sensitive questions, "
+        "security and privacy issues, and other non-computer science questions, you will refuse to answer.\n\n"
+    ),
+)
+
+
+# copied from seed_coder; no default system prompt, adds seed_oss tool calling and <seed:think> reasoning tags
+register_template(
+    name="seed_oss",
+    format_user=StringFormatter(
+        slots=[{"bos_token"}, "user\n{{content}}", {"eos_token"}, {"bos_token"}, "assistant\n"]
+    ),
+    format_system=StringFormatter(slots=[{"bos_token"}, "system\n{{content}}", {"eos_token"}]),
+    format_function=FunctionFormatter(slots=[{"bos_token"}, "\n{{content}}", {"eos_token"}], tool_format="seed_oss"),
+    format_tools=ToolFormatter(tool_format="seed_oss"),
+    template_class=ReasoningTemplate,
+    thought_words=("<seed:think>", "</seed:think>"),
+)
+
+
+register_template(
+    name="smollm",  # <|im_start|>/<|im_end|> turns with no default system prompt
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(
+    name="smollm2",  # same slots as "smollm", plus a default SmolLM system prompt
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+    default_system="You are a helpful AI assistant named SmolLM, trained by Hugging Face.",
+)
+
+
+register_template(
+    name="solar",  # "### System:" / "### User:" / "### Assistant:" headers, blocks separated by blank lines
+    format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
+    format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),
+    efficient_eos=True,
+)
+
+
+register_template(
+    name="starchat",  # <|user|>/<|system|> headers end with a newline, <|assistant|> does not; turns close with <|end|>
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
+    stop_words=["<|end|>"],
+)
+
+
+register_template(
+    name="telechat2",  # <_system>/<_user>/<_bot> markers; the default system prompt is a Chinese TeleChat identity
+    format_user=StringFormatter(slots=["<_user>{{content}}<_bot>"]),
+    format_system=StringFormatter(slots=["<_system>{{content}}"]),
+    default_system=(
+        "你是中国电信星辰语义大模型，英文名是TeleChat，你是由中电信人工智能科技有限公司和中国电信人工智能研究院（TeleAI）研发的人工智能助手。"
+    ),
+)
+
+
+register_template(
+    name="vicuna",  # plain-text "USER: ... ASSISTANT:" turns with an English default system prompt
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    replace_jinja_template=True,
+)
+
+
+register_template(
+    name="video_llava",  # same user slot and system prompt as "vicuna", with the video_llava plugin
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="video_llava", image_token="<image>", video_token="<video>"),
+)
+
+
+register_template(
+    name="xuanyuan",  # "Human: ... Assistant:" turns; the default system prompt is Chinese and ends with "\n"
+    format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
+    default_system=(
+        "以下是用户和人工智能助手之间的对话。用户以Human开头，人工智能助手以Assistant开头，"
+        "会对人类提出的问题给出有帮助、高质量、详细和礼貌的回答，并且总是拒绝参与与不道德、"
+        "不安全、有争议、政治敏感等相关的话题、问题和指示。\n"
+    ),
+)
+
+
+# copied from chatml template; passes no default system prompt, tool formatters or multimodal plugin
+register_template(
+    name="yi",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    stop_words=["<|im_end|>"],
+)
+
+
+register_template(
+    name="yi_vl",  # "### Human:" / "### Assistant:" turns, bilingual (English + Chinese) system prompt, llava plugin
+    format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]),
+    format_assistant=StringFormatter(slots=["{{content}}\n"]),
+    default_system=(
+        "This is a chat between an inquisitive human and an AI assistant. "
+        "Assume the role of the AI assistant. Read all the images carefully, "
+        "and respond to the human's questions with informative, helpful, detailed and polite answers. "
+        "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。"
+        "仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n"
+    ),
+    stop_words=["###"],
+    efficient_eos=True,
+    mm_plugin=get_mm_plugin(name="llava", image_token="<image>"),
+)
+
+
+register_template(
+    name="youtu",  # <|User|>/<|Assistant|> markers; replies end with <|end_of_text|>; "default" tool format
+    format_user=StringFormatter(slots=["<|User|>{{content}}<|Assistant|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>"]),
+    format_system=StringFormatter(slots=["{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="default"),
+    format_observation=StringFormatter(slots=["<tool_response>\n{{content}}\n</tool_response><|Assistant|>"]),
+    format_tools=ToolFormatter(tool_format="default"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    stop_words=["<|end_of_text|>"],
+    replace_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
+register_template(
+    name="youtu_vl",  # "{role}\n{content}" turns delimited by <|begin_of_text|>/<|end_of_text|>, youtu_vl plugin
+    format_user=StringFormatter(
+        slots=["<|begin_of_text|>user\n{{content}}<|end_of_text|>\n<|begin_of_text|>assistant\n"]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+    format_system=StringFormatter(slots=["<|begin_of_text|>system\n{{content}}<|end_of_text|>\n"]),
+    default_system="You are a helpful assistant.",
+    stop_words=["<|end_of_text|>"],
+    mm_plugin=get_mm_plugin(name="youtu_vl", image_token="<|image_pad|>", video_token="<|video_pad|>"),
+)
+
+
+register_template(
+    name="yuan",  # user turn is closed by a {"token": "<sep>"} dict slot; replies end with <eod>
+    format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
+    format_assistant=StringFormatter(slots=["{{content}}<eod>\n"]),
+    stop_words=["<eod>"],
+)
+
+
+register_template(
+    name="zephyr",  # <|user|>/<|system|> turns are closed by the eos_token slot; the reply follows <|assistant|>\n
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>\n"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
+    default_system="You are Zephyr, a helpful assistant.",
+)
diff --git a/LlamaFactory/src/llamafactory/data/tool_utils.py b/LlamaFactory/src/llamafactory/data/tool_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18c6ad2f08d149bdb990de77d612ff0a373234fa
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/data/tool_utils.py
@@ -0,0 +1,676 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ast
+import json
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, NamedTuple, Union
+
+from typing_extensions import override
+
+
+class FunctionCall(NamedTuple):
+    r"""A single tool invocation: the tool name plus its arguments.
+
+    In this module the extractors build ``arguments`` with ``json.dumps`` and most formatters read it back with
+    ``json.loads``, so it is expected to hold a JSON-encoded object.
+    """
+
+    # Name of the tool / function being called.
+    name: str
+    # JSON text of the keyword arguments for the call.
+    arguments: str
+
+
+# All prompts below are templates for ``str.format``: single-brace names are placeholders filled by the matching
+# ``tool_formatter``; doubled braces (``{{`` / ``}}``) render as literal braces.
+
+# Used by `DefaultToolUtils.tool_formatter`; placeholders: ``tool_text`` and ``tool_names``.
+DEFAULT_TOOL_PROMPT = (
+    "You have access to the following tools:\n{tool_text}"
+    "Use the following format if using a tool:\n"
+    "```\n"
+    "Action: tool name (one of [{tool_names}])\n"
+    "Action Input: the input to the tool, in a JSON format representing the kwargs "
+    """(e.g. ```{{"input": "hello world", "num_beams": 5}}```)\n"""
+    "```\n"
+)
+
+# Used by `GLM4ToolUtils.tool_formatter`; placeholder: ``tool_text``. The Chinese text is part of the prompt.
+GLM4_TOOL_PROMPT = (
+    "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱 AI 公司训练的语言模型 GLM-4 模型开发的，"
+    "你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{tool_text}"
+)
+
+# Used by `GLM4MOEToolUtils.tool_formatter`; placeholder: ``tool_text``.
+GLM4_MOE_TOOL_PROMPT = (
+    "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\n"
+    "You are provided with function signatures within <tools></tools> XML tags:\n<tools>{tool_text}"
+    "\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:"
+    "\n<tool_call>{{function-name}}"
+    "\n<arg_key>{{arg-key-1}}</arg_key>"
+    "\n<arg_value>{{arg-value-1}}</arg_value>"
+    "\n<arg_key>{{arg-key-2}}</arg_key>"
+    "\n<arg_value>{{arg-value-2}}</arg_value>"
+    "\n...\n</tool_call>\n"
+)
+
+# Used by `Llama3ToolUtils.tool_formatter`; placeholders: ``date`` and ``tool_text``.
+LLAMA3_TOOL_PROMPT = (
+    "Cutting Knowledge Date: December 2023\nToday Date: {date}\n\n"
+    "You have access to the following functions. To call a function, please respond with JSON for a function call. "
+    """Respond in the format {{"name": function name, "parameters": dictionary of argument name and its value}}. """
+    "Do not use variables.\n\n{tool_text}"
+)
+
+# Used by `MiniMaxM1ToolUtils.tool_formatter`; placeholder: ``tool_text``.
+MINIMAX_M1_TOOL_PROMPT = (
+    "You are provided with these tools:\n<tools>\n{tool_text}</tools>\n\n"
+    "If you need to call tools, please respond with <tool_calls></tool_calls> XML tags, and provide tool-name and "
+    "json-object of arguments, following the format below:\n<tool_calls>\n"
+    """{{"name": <tool-name-1>, "arguments": <args-json-object-1>}}\n...\n</tool_calls>"""
+)
+
+# Used by `MiniMaxM2ToolUtils.tool_formatter`; placeholder: ``tool_text``.
+MINIMAX_M2_TOOL_PROMPT = (
+    "\n\n# Tools\n\nYou may call one or more tools to assist with the user query.\n"
+    "Here are the tools available in JSONSchema format:\n\n<tools>\n{tool_text}</tools>\n\n"
+    "When making tool calls, use XML format to invoke tools and pass parameters:\n"
+    """\n<minimax:tool_call>\n<invoke name="tool-name-1">\n<parameter name="param-key-1">param-value-1</parameter>\n"""
+    """<parameter name="param-key-2">param-value-2</parameter>\n...\n</invoke>\n</minimax:tool_call>"""
+)
+
+# Used by `QwenToolUtils.tool_formatter`; placeholder: ``tool_text``.
+QWEN_TOOL_PROMPT = (
+    "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\n"
+    "You are provided with function signatures within <tools></tools> XML tags:\n<tools>{tool_text}"
+    "\n</tools>\n\nFor each function call, return a json object with function name and arguments within "
+    """<tool_call></tool_call> XML tags:\n<tool_call>\n{{"name": <function-name>, """
+    """"arguments": <args-json-object>}}\n</tool_call>"""
+)
+
+# Used by `SeedToolUtils.tool_formatter`; placeholder: ``tool_text``. The Chinese sentence is part of the prompt.
+SEED_TOOL_PROMPT = (
+    "system\nYou are Doubao, a helpful AI assistant. You may call one or more functions to assist with the user query."
+    "Tool List:\nYou are authorized to use the following tools (described in JSON Schema format). Before performing "
+    "any task, you must decide how to call them based on the descriptions and parameters of these tools.{tool_text}\n"
+    "工具调用请遵循如下格式:\n<seed:tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>value_1"
+    "</parameter>\n<parameter=example_parameter_2>This is the value for the second parameter\nthat can span\nmultiple "
+    "lines</parameter>\n</function>\n</seed:tool_call>\n"
+)
+
+# Used by `LingToolUtils.tool_formatter`; placeholder: ``tool_text``. Same wording as the Qwen prompt without
+# the two leading newlines.
+LING_TOOL_PROMPT = (
+    "# Tools\n\nYou may call one or more functions to assist with the user query.\n\n"
+    "You are provided with function signatures within <tools></tools> XML tags:\n<tools>{tool_text}"
+    "\n</tools>\n\nFor each function call, return a json object with function name and arguments within "
+    """<tool_call></tool_call> XML tags:\n<tool_call>\n{{"name": <function-name>, """
+    """"arguments": <args-json-object>}}\n</tool_call>"""
+)
+
+# Used by `LFM2ToolUtils.tool_formatter`; placeholder: ``tool_text`` (a JSON list of tool schemas).
+LFM2_TOOL_PROMPT = "List of tools: <|tool_list_start|>{tool_text}<|tool_list_end|>"
+
+
+@dataclass
+class ToolUtils(ABC):
+    """Base class for tool utilities."""
+
+    @staticmethod
+    @abstractmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        r"""Generate the system message describing all the available tools."""
+        ...
+
+    @staticmethod
+    @abstractmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        r"""Generate the assistant message including all the tool calls."""
+        ...
+
+    @staticmethod
+    @abstractmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        r"""Extract all the function calls from the assistant message.
+
+        It should be an inverse function of `function_formatter`.
+        """
+        ...
+
+
+class DefaultToolUtils(ToolUtils):
+    r"""Default tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        tool_names = []
+        for tool in tools:
+            tool = tool.get("function", "") if tool.get("type") == "function" else tool
+            param_text = ""
+            for name, param in tool["parameters"]["properties"].items():
+                required, enum, items = "", "", ""
+                if name in tool["parameters"].get("required", []):
+                    required = ", required"
+
+                if param.get("enum", None):
+                    enum = ", should be one of [{}]".format(", ".join(param["enum"]))
+
+                if param.get("items", None):
+                    items = ", where each item should be {}".format(param["items"].get("type", ""))
+
+                param_text += "  - {name} ({type}{required}): {desc}{enum}{items}\n".format(
+                    name=name,
+                    type=param.get("type", ""),
+                    required=required,
+                    desc=param.get("description", ""),
+                    enum=enum,
+                    items=items,
+                )
+
+            tool_text += "> Tool Name: {name}\nTool Description: {desc}\nTool Args:\n{args}\n".format(
+                name=tool["name"], desc=tool.get("description", ""), args=param_text
+            )
+            tool_names.append(tool["name"])
+
+        return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names))
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        return "\n".join([f"Action: {name}\nAction Input: {arguments}" for name, arguments in functions])
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL)
+        action_match: list[tuple[str, str]] = re.findall(regex, content)
+        if not action_match:
+            return content
+
+        results = []
+        for match in action_match:
+            tool_name = match[0].strip()
+            tool_input = match[1].strip().strip('"').strip("```")
+            try:
+                arguments = json.loads(tool_input)
+                results.append(FunctionCall(tool_name, json.dumps(arguments, ensure_ascii=False)))
+            except json.JSONDecodeError:
+                return content
+
+        return results
+
+
+class GLM4ToolUtils(ToolUtils):
+    r"""GLM-4 tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            tool = tool.get("function", "") if tool.get("type") == "function" else tool
+            tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时，请使用 Json 格式表示调用的参数。".format(
+                name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False)
+            )
+
+        return GLM4_TOOL_PROMPT.format(tool_text=tool_text)
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        if len(functions) > 1:
+            raise ValueError("GLM-4 does not support parallel functions.")
+
+        return f"{functions[0].name}\n{functions[0].arguments}"
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        if "\n" not in content:
+            return content
+
+        tool_name, tool_input = content.split("\n", maxsplit=1)
+        try:
+            arguments = json.loads(tool_input.strip())
+        except json.JSONDecodeError:
+            return content
+
+        return [FunctionCall(tool_name, json.dumps(arguments, ensure_ascii=False))]
+
+
+class Llama3ToolUtils(ToolUtils):
+    r"""Llama 3.x tool using template with `tools_in_user_message=False`.
+
+    Reference: https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling
+    """
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        date = datetime.now().strftime("%d %b %Y")
+        tool_text = ""
+        for tool in tools:
+            wrapped_tool = tool if tool.get("type") == "function" else {"type": "function", "function": tool}
+            tool_text += json.dumps(wrapped_tool, indent=4, ensure_ascii=False) + "\n\n"
+
+        return LLAMA3_TOOL_PROMPT.format(date=date, tool_text=tool_text)
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        function_objects = [{"name": name, "parameters": json.loads(arguments)} for name, arguments in functions]
+        return json.dumps(function_objects[0] if len(function_objects) == 1 else function_objects, ensure_ascii=False)
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        try:
+            tools = json.loads(content.strip())
+        except json.JSONDecodeError:
+            return content
+
+        tools = [tools] if not isinstance(tools, list) else tools
+        try:
+            return [FunctionCall(tool["name"], json.dumps(tool["parameters"], ensure_ascii=False)) for tool in tools]
+        except KeyError:
+            return content
+
+
+class MiniMaxM1ToolUtils(ToolUtils):
+    r"""MiniMax-M1 tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            tool = tool.get("function", "") if tool.get("type") == "function" else tool
+            tool_text += json.dumps(tool, ensure_ascii=False) + "\n"
+
+        return MINIMAX_M1_TOOL_PROMPT.format(tool_text=tool_text)
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        function_texts = []
+        for func in functions:
+            name, arguments = func.name, json.loads(func.arguments)
+            function_texts.append(json.dumps({"name": name, "arguments": arguments}, ensure_ascii=False))
+
+        return "<tool_calls>\n" + "\n".join(function_texts) + "\n</tool_calls>"
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        regex = re.compile(r"<tool_calls>\s*(.+?)\s*</tool_calls>", re.DOTALL)
+        tool_match = re.search(regex, content)
+        if not tool_match:
+            return content
+
+        tool_calls_content = tool_match.group(1)
+        results = []
+        for line in tool_calls_content.split("\n"):
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                tool_call = json.loads(line)
+                results.append(FunctionCall(tool_call["name"], json.dumps(tool_call["arguments"], ensure_ascii=False)))
+            except json.JSONDecodeError:
+                continue
+
+        return results
+
+
+class MiniMaxM2ToolUtils(ToolUtils):
+    r"""MiniMax-M2 tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            tool = tool.get("function", "") if tool.get("type") == "function" else tool
+            tool_text += "<tool>" + json.dumps(tool, ensure_ascii=False) + "</tool>\n"
+
+        return MINIMAX_M2_TOOL_PROMPT.format(tool_text=tool_text)
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        function_texts = []
+        for func in functions:
+            name, arguments = func.name, json.loads(func.arguments)
+            prompt = f'<invoke name="{name}">'
+            for key, value in arguments.items():
+                prompt += f'\n<parameter name="{key}">'
+                if not isinstance(value, str):
+                    value = json.dumps(value, ensure_ascii=False)
+                prompt += value + "</parameter>"
+            prompt += "\n</invoke>"
+            function_texts.append(prompt)
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        regex = re.compile(r"<minimax:tool_call>\s*(.+?)\s*</minimax:tool_call>", re.DOTALL)
+        tool_match = re.search(regex, content)
+        if not tool_match:
+            return content
+
+        tool_calls_content = tool_match.group(1)
+        invoke_regex = re.compile(r"<invoke name=\"(.*?)\">(.*?)</invoke>", re.DOTALL)
+        results = []
+
+        for func_name, params_block in re.findall(invoke_regex, tool_calls_content):
+            args_dict = {}
+            param_pattern = re.compile(r"<parameter name=\"(.*?)\">(.*?)</parameter>", re.DOTALL)
+            for key, raw_value in re.findall(param_pattern, params_block):
+                value = raw_value.strip()
+                try:
+                    parsed_value = json.loads(value)
+                except json.JSONDecodeError:
+                    parsed_value = raw_value
+                args_dict[key] = parsed_value
+
+            results.append(FunctionCall(func_name.strip(), json.dumps(args_dict, ensure_ascii=False)))
+
+        return results
+
+
+class MistralToolUtils(ToolUtils):
+    r"""Mistral v0.3 tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        wrapped_tools = []
+        for tool in tools:
+            wrapped_tools.append(tool if tool.get("type") == "function" else {"type": "function", "function": tool})
+
+        return "[AVAILABLE_TOOLS] " + json.dumps(wrapped_tools, ensure_ascii=False) + "[/AVAILABLE_TOOLS]"
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        return json.dumps(
+            [{"name": name, "arguments": json.loads(arguments)} for name, arguments in functions], ensure_ascii=False
+        )
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        try:
+            tools = json.loads(content.strip())
+        except json.JSONDecodeError:
+            return content
+
+        tools = [tools] if not isinstance(tools, list) else tools
+        try:
+            return [FunctionCall(tool["name"], json.dumps(tool["arguments"], ensure_ascii=False)) for tool in tools]
+        except KeyError:
+            return content
+
+
+class QwenToolUtils(ToolUtils):
+    r"""Qwen 2.5 tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            wrapped_tool = tool if tool.get("type") == "function" else {"type": "function", "function": tool}
+            tool_text += "\n" + json.dumps(wrapped_tool, ensure_ascii=False)
+
+        return QWEN_TOOL_PROMPT.format(tool_text=tool_text)
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        function_texts = [
+            json.dumps({"name": name, "arguments": json.loads(arguments)}, ensure_ascii=False)
+            for name, arguments in functions
+        ]
+        return "\n".join([f"<tool_call>\n{text}\n</tool_call>" for text in function_texts])
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        regex = re.compile(r"<tool_call>(.+?)</tool_call>(?=\s*<tool_call>|\s*$)", re.DOTALL)
+        tool_match: list[str] = re.findall(regex, content)
+        if not tool_match:
+            return content
+
+        results = []
+        for tool in tool_match:
+            try:
+                tool = json.loads(tool.strip())
+            except json.JSONDecodeError:
+                return content
+
+            if "name" not in tool or "arguments" not in tool:
+                return content
+
+            results.append(FunctionCall(tool["name"], json.dumps(tool["arguments"], ensure_ascii=False)))
+
+        return results
+
+
+class GLM4MOEToolUtils(QwenToolUtils):
+    r"""GLM-4-MOE tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            wrapped_tool = tool if tool.get("type") == "function" else {"type": "function", "function": tool}
+            tool_text += "\n" + json.dumps(wrapped_tool, ensure_ascii=False)
+
+        return GLM4_MOE_TOOL_PROMPT.format(tool_text=tool_text)
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        function_json = [
+            {"func_name": name, "func_key_values": json.loads(arguments)} for name, arguments in functions
+        ]
+        function_texts = []
+        for func in function_json:
+            prompt = "\n<tool_call>" + func["func_name"]
+            for key, value in func["func_key_values"].items():
+                prompt += "\n<arg_key>" + key + "</arg_key>"
+                if not isinstance(value, str):
+                    value = json.dumps(value, ensure_ascii=False)
+                prompt += "\n<arg_value>" + value + "</arg_value>"
+            function_texts.append(prompt)
+
+        return "\n".join(function_texts)
+
+
+class SeedToolUtils(ToolUtils):
+    r"""Seed tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        return SEED_TOOL_PROMPT.format(tool_text="\n" + json.dumps(tools, ensure_ascii=False))
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        function_json = [
+            {"func_name": name, "func_key_values": json.loads(arguments)} for name, arguments in functions
+        ]
+        function_texts = []
+        for func in function_json:
+            prompt = "\n<seed:tool_call>\n<function=" + func["func_name"]
+            for key, value in func["func_key_values"].items():
+                prompt += "\n<parameter=" + key + ">"
+                if not isinstance(value, str):
+                    value = json.dumps(value, ensure_ascii=False)
+                prompt += value + "</parameter>"
+            prompt += "\n</function>\n</seed:tool_call>"
+            function_texts.append(prompt)
+
+        return "\n".join(function_texts)
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        results = []
+        regex = re.compile(
+            r"<seed:tool_call>\s*<function=\s*([^\s<]+)\s*(.*?)\s*</function>\s*</seed:tool_call>", re.DOTALL
+        )
+        for func_name, params_block in re.findall(regex, content):
+            args_dict = {}
+            param_pattern = re.compile(r"<parameter=(.*?)>(.*?)</parameter>", re.DOTALL)
+            for key, raw_value in re.findall(param_pattern, params_block.strip()):
+                value = raw_value.strip()
+                try:
+                    parsed_value = json.loads(value)
+                except json.JSONDecodeError:
+                    parsed_value = raw_value
+                args_dict[key] = parsed_value
+
+            results.append(FunctionCall(func_name.strip(), json.dumps(args_dict, ensure_ascii=False)))
+
+        return results
+
+
+class LingToolUtils(QwenToolUtils):
+    r"""Ling v2 tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            wrapped_tool = tool if tool.get("type") == "function" else {"type": "function", "function": tool}
+            tool_text += "\n" + json.dumps(wrapped_tool, ensure_ascii=False)
+
+        return LING_TOOL_PROMPT.format(tool_text=tool_text) + "\n" + "detailed thinking off"
+
+
+class LFM2ToolUtils(ToolUtils):
+    r"""LFM2.5 tool using template with Pythonic function call syntax."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_list = []
+        for tool in tools:
+            tool = tool.get("function", tool) if tool.get("type") == "function" else tool
+            tool_list.append(tool)
+
+        return LFM2_TOOL_PROMPT.format(tool_text=json.dumps(tool_list, ensure_ascii=False))
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        calls = []
+        for name, args_json in functions:
+            args = json.loads(args_json)
+            kwargs_parts = []
+            for key, value in args.items():
+                if isinstance(value, str):
+                    kwargs_parts.append(f'{key}="{value}"')
+                else:
+                    kwargs_parts.append(f"{key}={json.dumps(value, ensure_ascii=False)}")
+
+            calls.append(f"{name}({', '.join(kwargs_parts)})")
+
+        return f"<|tool_call_start|>[{', '.join(calls)}]<|tool_call_end|>"
+
+    @staticmethod
+    def _ast_to_value(node: ast.AST) -> Any:
+        """Convert an AST node to a Python value, handling JSON-style booleans/null."""
+        # Handle JSON-style true/false/null as Name nodes
+        if isinstance(node, ast.Name):
+            if node.id == "true":
+                return True
+            elif node.id == "false":
+                return False
+            elif node.id == "null":
+                return None
+            else:
+                raise ValueError(f"Unknown identifier: {node.id}")
+
+        # Use literal_eval for other cases (strings, numbers, lists, dicts)
+        return ast.literal_eval(node)
+
+    @override
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, list["FunctionCall"]]:
+        # Extract content between tool call markers
+        start_marker = "<|tool_call_start|>"
+        end_marker = "<|tool_call_end|>"
+
+        start_idx = content.find(start_marker)
+        if start_idx == -1:
+            return content
+
+        end_idx = content.find(end_marker, start_idx)
+        if end_idx == -1:
+            return content
+
+        tool_call_str = content[start_idx + len(start_marker) : end_idx].strip()
+
+        # Parse Pythonic function call syntax using AST
+        try:
+            tree = ast.parse(tool_call_str, mode="eval")
+        except SyntaxError:
+            return content
+
+        # Handle both single call and list of calls
+        if isinstance(tree.body, ast.List):
+            call_nodes = tree.body.elts
+        elif isinstance(tree.body, ast.Call):
+            call_nodes = [tree.body]
+        else:
+            return content
+
+        results = []
+        for node in call_nodes:
+            if not isinstance(node, ast.Call):
+                return content
+
+            # Extract function name
+            if isinstance(node.func, ast.Name):
+                func_name = node.func.id
+            else:
+                return content
+
+            # Extract keyword arguments
+            args_dict = {}
+            for keyword in node.keywords:
+                key = keyword.arg
+                try:
+                    value = LFM2ToolUtils._ast_to_value(keyword.value)
+                except (ValueError, SyntaxError):
+                    return content
+                args_dict[key] = value
+
+            results.append(FunctionCall(func_name, json.dumps(args_dict, ensure_ascii=False)))
+
+        return results if results else content
+
+
+# Registry mapping a tool-format name to one shared instance of its `ToolUtils` implementation.
+# `get_tool_utils` looks names up here and raises `ValueError` for anything not listed.
+TOOLS = {
+    "default": DefaultToolUtils(),
+    "glm4": GLM4ToolUtils(),
+    "llama3": Llama3ToolUtils(),
+    "lfm2": LFM2ToolUtils(),
+    "minimax1": MiniMaxM1ToolUtils(),
+    "minimax2": MiniMaxM2ToolUtils(),
+    "mistral": MistralToolUtils(),
+    "qwen": QwenToolUtils(),
+    "glm4_moe": GLM4MOEToolUtils(),
+    "seed_oss": SeedToolUtils(),
+    "ling": LingToolUtils(),
+}
+
+
+def get_tool_utils(name: str) -> "ToolUtils":
+    tool_utils = TOOLS.get(name, None)
+    if tool_utils is None:
+        raise ValueError(f"Tool utils `{name}` not found.")
+
+    return tool_utils
diff --git a/LlamaFactory/src/llamafactory/eval/__init__.py b/LlamaFactory/src/llamafactory/eval/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/eval/evaluator.py b/LlamaFactory/src/llamafactory/eval/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..7729c59bf413cb66054a3e06f2e42d6794cb495d
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/eval/evaluator.py
@@ -0,0 +1,158 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# This code is inspired by Dan Hendrycks' test library.
+# https://github.com/hendrycks/test/blob/master/evaluate_flan.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2020 Dan Hendrycks
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import json
+import os
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from tqdm import tqdm, trange
+from transformers.utils import cached_file
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras.constants import CHOICES, SUBJECTS
+from ..hparams import get_eval_args
+from ..model import load_model, load_tokenizer
+from .template import get_eval_template
+
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+class Evaluator:
+    r"""Multiple-choice evaluator: scores each question by comparing the logits of the choice tokens."""
+
+    def __init__(self, args: Optional[dict[str, Any]] = None) -> None:
+        r"""Parse the evaluation arguments and load the tokenizer, chat template, model and eval template."""
+        self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
+        self.tokenizer = load_tokenizer(self.model_args)["tokenizer"]
+        self.tokenizer.padding_side = "right"  # avoid overflow issue in batched inference for llama2
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args)
+        self.model = load_model(self.tokenizer, self.model_args, finetuning_args)
+        self.eval_template = get_eval_template(self.eval_args.lang)
+        # For every choice label keep the id of the last token of its encoding (no special tokens added).
+        self.choice_inputs = [self.tokenizer.encode(ch, add_special_tokens=False)[-1] for ch in CHOICES]
+
+    @torch.inference_mode()
+    def batch_inference(self, batch_input: dict[str, "torch.Tensor"]) -> list[str]:
+        r"""Predict one answer letter per sequence in the padded batch."""
+        logits = self.model(**batch_input).logits
+        # Number of attended tokens per sequence; with right padding, `lengths[i] - 1` indexes the last real token.
+        lengths = torch.sum(batch_input["attention_mask"], dim=-1)
+        word_probs = torch.stack([logits[i, lengths[i] - 1] for i in range(len(lengths))], dim=0)
+        # Softmax restricted to the choice token ids.
+        choice_probs = torch.nn.functional.softmax(word_probs[:, self.choice_inputs], dim=-1).detach()
+        # NOTE(review): the index-to-letter mapping assumes CHOICES is "A", "B", ... in order - confirm.
+        return [chr(ord("A") + offset.item()) for offset in torch.argmax(choice_probs, dim=-1)]
+
+    def eval(self) -> None:
+        r"""Run the evaluation over every subject listed in the task's ``mapping.json`` and save the results."""
+        # The task string is read as "<task>_<split>": first field is the dataset, second the split to score.
+        eval_task = self.eval_args.task.split("_")[0]
+        eval_split = self.eval_args.task.split("_")[1]
+
+        mapping = cached_file(
+            path_or_repo_id=os.path.join(self.eval_args.task_dir, eval_task),
+            filename="mapping.json",
+            cache_dir=self.model_args.cache_dir,
+            token=self.model_args.hf_hub_token,
+        )
+
+        with open(mapping, encoding="utf-8") as f:
+            categorys: dict[str, dict[str, str]] = json.load(f)
+
+        # One boolean correctness array per entry of SUBJECTS.
+        # NOTE(review): the loop below indexes this with each subject's "category" and with "Average",
+        # so SUBJECTS presumably contains all of those names - confirm.
+        category_corrects = {subj: np.array([], dtype="bool") for subj in SUBJECTS}
+        pbar = tqdm(categorys.keys(), desc="Processing subjects", position=0)
+        results = {}
+        for subject in pbar:
+            dataset = load_dataset(
+                path=os.path.join(self.eval_args.task_dir, eval_task),
+                name=subject,
+                cache_dir=self.model_args.cache_dir,
+                download_mode=self.eval_args.download_mode,
+                token=self.model_args.hf_hub_token,
+                trust_remote_code=self.model_args.trust_remote_code,
+            )
+            pbar.set_postfix_str(categorys[subject]["name"])
+            inputs, outputs, labels = [], [], []
+            for i in trange(len(dataset[eval_split]), desc="Formatting batches", position=1, leave=False):
+                # Few-shot examples: the train split is reshuffled for every target and at most `n_shot` are kept.
+                support_set = (
+                    dataset["train"].shuffle().select(range(min(self.eval_args.n_shot, len(dataset["train"]))))
+                )
+                messages = self.eval_template.format_example(
+                    target_data=dataset[eval_split][i],
+                    support_set=support_set,
+                    subject_name=categorys[subject]["name"],
+                )
+
+                # Only the first value returned by `encode_oneturn` is used as the model input.
+                input_ids, _ = self.template.encode_oneturn(tokenizer=self.tokenizer, messages=messages)
+                inputs.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
+                # The content of the last message is the reference answer.
+                labels.append(messages[-1]["content"])
+
+            for i in trange(
+                0, len(inputs), self.eval_args.batch_size, desc="Predicting batches", position=1, leave=False
+            ):
+                batch_input = self.tokenizer.pad(
+                    inputs[i : i + self.eval_args.batch_size], return_attention_mask=True, return_tensors="pt"
+                ).to(self.model.device)
+                preds = self.batch_inference(batch_input)
+                outputs += preds
+
+            # Element-wise comparison of predicted and reference letters.
+            corrects = np.array(outputs) == np.array(labels)
+            category_name = categorys[subject]["category"]
+            category_corrects[category_name] = np.concatenate([category_corrects[category_name], corrects], axis=0)
+            category_corrects["Average"] = np.concatenate([category_corrects["Average"], corrects], axis=0)
+            # Predictions keyed by the example index as a string.
+            results[subject] = {str(i): outputs[i] for i in range(len(outputs))}
+
+        pbar.close()
+        self._save_results(category_corrects, results)
+
+    def _save_results(self, category_corrects: dict[str, "NDArray"], results: dict[str, dict[str, str]]) -> None:
+        r"""Print the accuracy of every non-empty category and, if `save_dir` is set, write the result files."""
+        score_info = "\n".join(
+            [
+                f"{category_name:>15}: {100 * np.mean(category_correct):.2f}"
+                for category_name, category_correct in category_corrects.items()
+                if len(category_correct)
+            ]
+        )
+        print(score_info)
+        if self.eval_args.save_dir is not None:
+            # exist_ok=False: raises FileExistsError when the directory already exists.
+            os.makedirs(self.eval_args.save_dir, exist_ok=False)
+            with open(os.path.join(self.eval_args.save_dir, "results.json"), "w", encoding="utf-8", newline="\n") as f:
+                json.dump(results, f, indent=2)
+
+            with open(os.path.join(self.eval_args.save_dir, "results.log"), "w", encoding="utf-8", newline="\n") as f:
+                f.write(score_info)
+
+
+def run_eval() -> None:
+    Evaluator().eval()
diff --git a/LlamaFactory/src/llamafactory/eval/template.py b/LlamaFactory/src/llamafactory/eval/template.py
new file mode 100644
index 0000000000000000000000000000000000000000..5742469787a5001001a2702f183306bd2a312aef
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/eval/template.py
@@ -0,0 +1,79 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from ..data import Role
+from ..extras.constants import CHOICES
+
+
+@dataclass
+class EvalTemplate:
+    system: str  # preamble; formatted with `subject` and prepended to the first message
+    choice: str  # per-option pattern; formatted with `choice` and `content`
+    answer: str  # suffix appended after the options to cue the answer
+
+    def _parse_example(self, example: dict[str, str]) -> tuple[str, str]:
+        r"""Render one example as a (prompt, response) pair.
+
+        The prompt is the question, one formatted entry per key of CHOICES present in
+        `example`, then the answer cue; the response is `example["answer"]`.
+        """
+        options = [self.choice.format(choice=key, content=example[key]) for key in CHOICES if key in example]
+        return "".join([example["question"], *options, self.answer]), example["answer"]
+
+    def format_example(
+        self, target_data: dict[str, str], support_set: list[dict[str, str]], subject_name: str
+    ) -> list[dict[str, str]]:
+        r"""Build alternating user/assistant messages: the support examples first, then the target."""
+        messages = []
+        for shot in [*support_set, target_data]:
+            question, reply = self._parse_example(shot)
+            messages += [
+                {"role": Role.USER.value, "content": question},
+                {"role": Role.ASSISTANT.value, "content": reply},
+            ]
+
+        # The subject-specific system text goes in front of the very first user turn.
+        messages[0]["content"] = self.system.format(subject=subject_name) + messages[0]["content"]
+        return messages
+
+
+eval_templates: dict[str, "EvalTemplate"] = {}  # template name -> EvalTemplate, filled by _register_eval_template
+
+
+def _register_eval_template(name: str, system: str, choice: str, answer: str) -> None:
+    eval_templates[name] = EvalTemplate(system, choice, answer)  # positional, in field declaration order
+
+
+def get_eval_template(name: str) -> "EvalTemplate":
+    template = eval_templates.get(name)  # None when `name` was never registered
+    assert template is not None, f"Template {name} does not exist."
+    return template
+
+
+_register_eval_template(
+    name="en",  # English prompts
+    system="The following are multiple choice questions (with answers) about {subject}.\n\n",
+    choice="\n{choice}. {content}",
+    answer="\nAnswer:",
+)
+
+
+_register_eval_template(
+    name="zh",  # Chinese prompts; same option layout as "en"
+    system="以下是中国关于{subject}考试的单项选择题，请选出其中的正确答案。\n\n",  # "Below are single-choice questions from Chinese {subject} exams; choose the correct answer."
+    choice="\n{choice}. {content}",
+    answer="\n答案：",  # "Answer:" (with a full-width colon)
+)
diff --git a/LlamaFactory/src/llamafactory/extras/__init__.py b/LlamaFactory/src/llamafactory/extras/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84abaf20a81749b56a1cd368676bd988efbbd5f2
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e20f2fcf89ee1c9496a185b2520047566f55a44
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76a8cf04ffc2f74b54f7a06a0694b25af682a7f2
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..483915bf3b082e134c0045f45ccc612ba0ba55ae
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/env.cpython-312.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/env.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf2c332552f21142f36f0f1a262f76506aab8a2e
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/env.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02c4bd7f71b73eb812753ef963dd2c5e76c6654c
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/logging.cpython-312.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/logging.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55e500f955bbd3e90ac385486f4e74b89e2eb032
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/logging.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e4c209aed84ae06da8df5a848ed370b6a73e2aa
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/misc.cpython-312.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/misc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4eede0856be84d0060bdb623f4137f0feaa4fe57
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/misc.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d967211987d0780ecacaeff61cff43aec17d2fe9
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/packages.cpython-312.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/packages.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70d1f6bd66abdcc1c29cfdd7a635c9e4bd0d912f
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/packages.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc b/LlamaFactory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0644f0f61fa9ed3e7f566b8c806b965fe89bdb73
Binary files /dev/null and b/LlamaFactory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/extras/constants.py b/LlamaFactory/src/llamafactory/extras/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..724ba1a2c126b83aa8d5fb0435f23d43ed72bec1
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/extras/constants.py
@@ -0,0 +1,3425 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import OrderedDict, defaultdict
+from enum import Enum, unique
+
+from peft.utils import SAFETENSORS_WEIGHTS_NAME as SAFE_ADAPTER_WEIGHTS_NAME
+from peft.utils import WEIGHTS_NAME as ADAPTER_WEIGHTS_NAME
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
+
+
+AUDIO_PLACEHOLDER = os.getenv("AUDIO_PLACEHOLDER", "<audio>")  # overridable via the AUDIO_PLACEHOLDER env var
+
+CHECKPOINT_NAMES = {  # weight / index file names taken from peft and transformers
+    SAFE_ADAPTER_WEIGHTS_NAME,
+    ADAPTER_WEIGHTS_NAME,
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFE_WEIGHTS_NAME,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+}
+
+CHOICES = ["A", "B", "C", "D"]  # option keys looked up in eval examples (see eval/template.py)
+
+DATA_CONFIG = "dataset_info.json"
+
+DEFAULT_TEMPLATE = defaultdict(str)  # model name -> template, filled by register_model_group; unknown names give ""
+
+FILEEXT2TYPE = {  # file extension -> type name; note "jsonl" maps to "json" and "txt" to "text"
+    "arrow": "arrow",
+    "csv": "csv",
+    "json": "json",
+    "jsonl": "json",
+    "parquet": "parquet",
+    "txt": "text",
+}
+
+IGNORE_INDEX = -100  # NOTE(review): presumably the label id skipped by the loss (PyTorch's default ignore_index) — confirm
+
+IMAGE_PLACEHOLDER = os.getenv("IMAGE_PLACEHOLDER", "<image>")  # overridable via the IMAGE_PLACEHOLDER env var
+
+LAYERNORM_NAMES = {"norm", "ln"}
+
+LLAMABOARD_CONFIG = "llamaboard_config.yaml"
+
+MCA_SUPPORTED_MODELS = {
+    "deepseek_v3",
+    "glm4_moe",
+    "llama",
+    "mistral",
+    "mixtral",
+    "qwen2",
+    "qwen2_vl",
+    "qwen2_5_vl",
+    "qwen3_vl",
+    "qwen3",
+    "qwen3_moe",
+    "qwen3_next",
+}
+
+METHODS = ["full", "freeze", "lora", "oft"]
+
+MOD_SUPPORTED_MODELS = {"bloom", "falcon", "gemma", "llama", "mistral", "mixtral", "phi", "starcoder2"}
+
+MULTIMODAL_SUPPORTED_MODELS = set()  # filled by register_model_group(..., multimodal=True)
+
+PEFT_METHODS = {"lora", "oft"}  # subset of METHODS
+
+RUNNING_LOG = "running_log.txt"
+
+SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"]
+
+SUPPORTED_MODELS = OrderedDict()  # model name -> {DownloadSource: path}, in registration order
+
+TRAINER_LOG = "trainer_log.jsonl"
+
+TRAINING_ARGS = "training_args.yaml"
+
+TRAINING_STAGES = {  # display name -> short stage code
+    "Supervised Fine-Tuning": "sft",
+    "Reward Modeling": "rm",
+    "PPO": "ppo",
+    "DPO": "dpo",
+    "KTO": "kto",
+    "Pre-Training": "pt",
+}
+
+STAGES_USE_PAIR_DATA = {"rm", "dpo"}  # stage codes from TRAINING_STAGES; presumably those fed paired data — confirm
+
+SUPPORTED_CLASS_FOR_S2ATTN = {"llama"}
+
+SWANLAB_CONFIG = "swanlab_public_config.json"
+
+VIDEO_PLACEHOLDER = os.getenv("VIDEO_PLACEHOLDER", "<video>")  # overridable via the VIDEO_PLACEHOLDER env var
+
+V_HEAD_WEIGHTS_NAME = "value_head.bin"
+
+V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors"
+
+
+class AttentionFunction(str, Enum):  # str mixin: each member is also a plain string equal to its value
+    AUTO = "auto"
+    DISABLED = "disabled"
+    SDPA = "sdpa"
+    FA2 = "fa2"
+    FA3 = "fa3"
+
+
+class EngineName(str, Enum):
+    HF = "huggingface"
+    VLLM = "vllm"
+    SGLANG = "sglang"
+    KT = "ktransformers"
+
+
+class DownloadSource(str, Enum):  # key type of the per-model path dicts given to register_model_group
+    DEFAULT = "hf"  # NOTE(review): presumably the Hugging Face Hub — confirm
+    MODELSCOPE = "ms"
+    OPENMIND = "om"
+
+
+@unique  # enum.unique: class creation fails with ValueError if two members share a value
+class QuantizationMethod(str, Enum):
+    r"""Borrowed from `transformers.utils.quantization_config.QuantizationMethod`."""
+
+    BNB = "bnb"
+    GPTQ = "gptq"
+    AWQ = "awq"
+    AQLM = "aqlm"
+    QUANTO = "quanto"
+    EETQ = "eetq"
+    HQQ = "hqq"
+    MXFP4 = "mxfp4"
+    FP8 = "fp8"
+
+
+class RopeScaling(str, Enum):
+    LINEAR = "linear"
+    DYNAMIC = "dynamic"
+    YARN = "yarn"
+    LLAMA3 = "llama3"
+
+
+def register_model_group(
+    models: dict[str, dict[DownloadSource, str]],
+    template: str | None = None,
+    multimodal: bool = False,
+) -> None:
+    r"""Record each model in SUPPORTED_MODELS; set DEFAULT_TEMPLATE / MULTIMODAL_SUPPORTED_MODELS when applicable."""
+    chat_markers = ("-Chat", "-Distill", "-Instruct", "-Thinking")
+    for model_name, sources in models.items():
+        SUPPORTED_MODELS[model_name] = sources
+        # Text-only names without a chat-style marker get no default template.
+        if template is not None and (multimodal or any(marker in model_name for marker in chat_markers)):
+            DEFAULT_TEMPLATE[model_name] = template
+        if multimodal:
+            MULTIMODAL_SUPPORTED_MODELS.add(model_name)
+
+
+register_model_group(
+    models={
+        "Aya-23-8B-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/aya-23-8B",
+        },
+        "Aya-23-35B-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/aya-23-35B",
+        },
+    },
+    template="cohere",
+)
+
+
+register_model_group(  # no template argument: these names get no DEFAULT_TEMPLATE entry
+    models={
+        "BLOOM-560M": {
+            DownloadSource.DEFAULT: "bigscience/bloom-560m",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-560m",
+        },
+        "BLOOM-3B": {
+            DownloadSource.DEFAULT: "bigscience/bloom-3b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-3b",
+        },
+        "BLOOM-7B1": {
+            DownloadSource.DEFAULT: "bigscience/bloom-7b1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-7b1",
+        },
+    },
+)
+
+
+register_model_group(  # no template argument: these names get no DEFAULT_TEMPLATE entry
+    models={
+        "BLOOMZ-560M": {
+            DownloadSource.DEFAULT: "bigscience/bloomz-560m",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-560m",
+        },
+        "BLOOMZ-3B": {
+            DownloadSource.DEFAULT: "bigscience/bloomz-3b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-3b",
+        },
+        "BLOOMZ-7B1-mt": {
+            DownloadSource.DEFAULT: "bigscience/bloomz-7b1-mt",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-7b1-mt",
+        },
+    },
+)
+
+
+register_model_group(
+    models={
+        "Breeze-7B": {  # no "-Chat"/"-Distill"/"-Instruct"/"-Thinking" in the name, so no default template is recorded
+            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Base-v1_0",
+        },
+        "Breeze-7B-Instruct": {
+            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
+        },
+    },
+    template="breeze",
+)
+
+
+register_model_group(
+    models={
+        "ChatGLM3-6B-Base": {
+            DownloadSource.DEFAULT: "zai-org/chatglm3-6b-base",
+            DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b-base",
+        },
+        "ChatGLM3-6B-Chat": {
+            DownloadSource.DEFAULT: "zai-org/chatglm3-6b",
+            DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b",
+        },
+    },
+    template="chatglm3",
+)
+
+
+register_model_group(
+    models={
+        "Chinese-Llama-2-1.3B": {
+            DownloadSource.DEFAULT: "hfl/chinese-llama-2-1.3b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-1.3b",
+        },
+        "Chinese-Llama-2-7B": {
+            DownloadSource.DEFAULT: "hfl/chinese-llama-2-7b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-7b",
+        },
+        "Chinese-Llama-2-13B": {
+            DownloadSource.DEFAULT: "hfl/chinese-llama-2-13b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-13b",
+        },
+        "Chinese-Alpaca-2-1.3B-Chat": {
+            DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-1.3b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-1.3b",
+        },
+        "Chinese-Alpaca-2-7B-Chat": {
+            DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-7b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-7b",
+        },
+        "Chinese-Alpaca-2-13B-Chat": {
+            DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-13b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-13b",
+        },
+    },
+    template="llama2_zh",
+)
+
+
+register_model_group(
+    models={
+        "CodeGemma-7B": {
+            DownloadSource.DEFAULT: "google/codegemma-7b",
+        },
+        "CodeGemma-7B-Instruct": {
+            DownloadSource.DEFAULT: "google/codegemma-7b-it",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/codegemma-7b-it",
+        },
+        "CodeGemma-1.1-2B": {
+            DownloadSource.DEFAULT: "google/codegemma-1.1-2b",
+        },
+        "CodeGemma-1.1-7B-Instruct": {
+            DownloadSource.DEFAULT: "google/codegemma-1.1-7b-it",
+        },
+    },
+    template="gemma",
+)
+
+
+register_model_group(
+    models={
+        "Codestral-22B-v0.1-Chat": {
+            DownloadSource.DEFAULT: "mistralai/Codestral-22B-v0.1",
+            DownloadSource.MODELSCOPE: "swift/Codestral-22B-v0.1",
+        },
+    },
+    template="mistral",
+)
+
+
+register_model_group(
+    models={
+        "CommandR-35B-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-v01",
+        },
+        "CommandR-Plus-104B-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-plus",
+        },
+        "CommandR-35B-4bit-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01-4bit",
+            DownloadSource.MODELSCOPE: "mirror013/c4ai-command-r-v01-4bit",
+        },
+        "CommandR-Plus-104B-4bit-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus-4bit",
+        },
+    },
+    template="cohere",
+)
+
+
+register_model_group(
+    models={
+        "DBRX-132B-Base": {
+            DownloadSource.DEFAULT: "databricks/dbrx-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-base",
+        },
+        "DBRX-132B-Instruct": {
+            DownloadSource.DEFAULT: "databricks/dbrx-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-instruct",
+        },
+    },
+    template="dbrx",
+)
+
+
+register_model_group(
+    models={
+        "DeepSeek-LLM-7B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-base",
+        },
+        "DeepSeek-LLM-67B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-base",
+        },
+        "DeepSeek-LLM-7B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-chat",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-chat",
+        },
+        "DeepSeek-LLM-67B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-chat",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-chat",
+        },
+        "DeepSeek-Math-7B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-math-7b-base",
+        },
+        "DeepSeek-Math-7B-Instruct": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-instruct",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-math-7b-instruct",
+        },
+        "DeepSeek-MoE-16B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base",
+        },
+        "DeepSeek-MoE-16B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat",
+        },
+        "DeepSeek-V2-16B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Lite",
+        },
+        "DeepSeek-V2-236B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2",
+        },
+        "DeepSeek-V2-16B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite-Chat",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Lite-Chat",
+        },
+        "DeepSeek-V2-236B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat",
+        },
+        "DeepSeek-Coder-V2-16B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
+        },
+        "DeepSeek-Coder-V2-236B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Base",
+        },
+        "DeepSeek-Coder-V2-16B-Instruct": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        },
+        "DeepSeek-Coder-V2-236B-Instruct": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Instruct",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Instruct",
+        },
+    },
+    template="deepseek",
+)
+
+
+register_model_group(
+    models={
+        "DeepSeek-Coder-6.7B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-base",
+        },
+        "DeepSeek-Coder-7B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-base-v1.5",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-7b-base-v1.5",
+        },
+        "DeepSeek-Coder-33B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-base",
+        },
+        "DeepSeek-Coder-6.7B-Instruct": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-instruct",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-instruct",
+        },
+        "DeepSeek-Coder-7B-Instruct": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
+        },
+        "DeepSeek-Coder-33B-Instruct": {
+            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-instruct",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-instruct",
+        },
+    },
+    template="deepseekcoder",
+)
+
+
+register_model_group(
+    models={
+        "DeepSeek-V2-0628-236B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat-0628",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat-0628",
+        },
+        "DeepSeek-V2.5-236B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2.5",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2.5",
+        },
+        "DeepSeek-V2.5-1210-236B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2.5-1210",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2.5-1210",
+        },
+        "DeepSeek-V3-671B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V3-Base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V3-Base",
+        },
+        "DeepSeek-V3-671B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V3",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V3",
+        },
+        "DeepSeek-V3-0324-671B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V3-0324",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V3-0324",
+        },
+    },
+    template="deepseek3",
+)
+
+
+register_model_group(
+    models={
+        "DeepSeek-R1-1.5B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+        },
+        "DeepSeek-R1-7B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+        },
+        "DeepSeek-R1-8B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        },
+        "DeepSeek-R1-14B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+        },
+        "DeepSeek-R1-32B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+        },
+        "DeepSeek-R1-70B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+        },
+        "DeepSeek-R1-671B-Chat-Zero": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Zero",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Zero",
+        },
+        "DeepSeek-R1-671B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1",
+        },
+        "DeepSeek-R1-0528-8B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
+        },
+        "DeepSeek-R1-0528-671B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-0528",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-0528",
+        },
+    },
+    template="deepseekr1",
+)
+
+
+register_model_group(
+    models={
+        "Devstral-Small-2507-Instruct": {
+            DownloadSource.DEFAULT: "mistralai/Devstral-Small-2507",
+            DownloadSource.MODELSCOPE: "mistralai/Devstral-Small-2507",
+        },
+    },
+    template="mistral_small",
+)
+
+
+register_model_group(
+    models={
+        "dots.ocr": {
+            DownloadSource.DEFAULT: "rednote-hilab/dots.ocr",
+            DownloadSource.MODELSCOPE: "rednote-hilab/dots.ocr",
+        },
+    },
+    template="dots_ocr",
+    multimodal=True,  # also what makes "dots.ocr" receive the template: its name has no chat-style suffix
+)
+
+
+register_model_group(
+    models={
+        "ERNIE-4.5-21B-A3B-Thinking": {
+            DownloadSource.DEFAULT: "baidu/ERNIE-4.5-21B-A3B-Thinking",
+            DownloadSource.MODELSCOPE: "PaddlePaddle/ERNIE-4.5-21B-A3B-Thinking",
+        },
+    },
+    template="ernie",
+)
+
+
+register_model_group(
+    models={
+        "ERNIE-4.5-0.3B-Instruct": {
+            DownloadSource.DEFAULT: "baidu/ERNIE-4.5-0.3B-PT",
+            DownloadSource.MODELSCOPE: "PaddlePaddle/ERNIE-4.5-0.3B-PT",
+        },
+        "ERNIE-4.5-21B-A3B-Instruct": {
+            DownloadSource.DEFAULT: "baidu/ERNIE-4.5-21B-A3B-PT",
+            DownloadSource.MODELSCOPE: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT",
+        },
+        "ERNIE-4.5-300B-A47B-Instruct": {
+            DownloadSource.DEFAULT: "baidu/ERNIE-4.5-300B-A47B-PT",
+            DownloadSource.MODELSCOPE: "PaddlePaddle/ERNIE-4.5-300B-A47B-PT",
+        },
+    },
+    template="ernie_nothink",
+)
+
+
+register_model_group(
+    models={
+        "ERNIE-4.5-VL-28B-A3B-Instruct": {
+            DownloadSource.DEFAULT: "baidu/ERNIE-4.5-VL-28B-A3B-PT",
+            DownloadSource.MODELSCOPE: "PaddlePaddle/ERNIE-4.5-VL-28B-A3B-PT",
+        },
+        "ERNIE-4.5-VL-28B-A3B-Thinking": {
+            DownloadSource.DEFAULT: "baidu/ERNIE-4.5-VL-28B-A3B-Thinking",
+            DownloadSource.MODELSCOPE: "PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Thinking",
+        },
+        "ERNIE-4.5-VL-424B-A47B-Instruct": {
+            DownloadSource.DEFAULT: "baidu/ERNIE-4.5-VL-424B-A47B-PT",
+            DownloadSource.MODELSCOPE: "PaddlePaddle/ERNIE-4.5-VL-424B-A47B-PT",
+        },
+    },
+    template="ernie_vl",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "EXAONE-3.0-7.8B-Instruct": {
+            DownloadSource.DEFAULT: "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
+        },
+    },
+    template="exaone",
+)
+
+
+register_model_group(
+    models={
+        "Falcon-7B": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-7b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b",
+        },
+        "Falcon-11B": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-11B",
+            DownloadSource.MODELSCOPE: "tiiuae/falcon-11B",
+        },
+        "Falcon-40B": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-40b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b",
+        },
+        "Falcon-180B": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-180b",
+            DownloadSource.MODELSCOPE: "modelscope/falcon-180B",
+        },
+        "Falcon-7B-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-7b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b-instruct",
+        },
+        "Falcon-40B-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-40b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b-instruct",
+        },
+        "Falcon-180B-Chat": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-180b-chat",
+            DownloadSource.MODELSCOPE: "modelscope/falcon-180B-chat",
+        },
+    },
+    template="falcon",
+)
+
+
+register_model_group(  # Falcon-H1 family; only the "-Instruct" names get the "falcon_h1" default template
+    models={
+        "Falcon-H1-0.5B-Base": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-0.5B-Base",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-0.5B-Base",
+        },
+        "Falcon-H1-1.5B-Base": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Base",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Base",
+        },
+        "Falcon-H1-1.5B-Deep-Base": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Deep-Base",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Deep-Base",
+        },
+        "Falcon-H1-3B-Base": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-3B-Base",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-3B-Base",
+        },
+        "Falcon-H1-7B-Base": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-7B-Base",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-7B-Base",
+        },
+        "Falcon-H1-34B-Base": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-34B-Base",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-34B-Base",
+        },
+        "Falcon-H1-0.5B-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-0.5B-Instruct",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-0.5B-Instruct",
+        },
+        "Falcon-H1-1.5B-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Instruct",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Instruct",
+        },
+        "Falcon-H1-1.5B-Deep-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Deep-Instruct",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Deep-Instruct",
+        },
+        "Falcon-H1-3B-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-3B-Instruct",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-3B-Instruct",
+        },
+        "Falcon-H1-7B-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-7B-Instruct",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-7B-Instruct",
+        },
+        "Falcon-H1-34B-Instruct": {
+            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-34B-Instruct",
+            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-34B-Instruct",
+        },
+    },
+    template="falcon_h1",
+)
+
+
+register_model_group(  # Gemma 1 / 1.1; each ModelScope path mirrors the Hugging Face repo name of the same entry
+    models={
+        "Gemma-2B": {
+            DownloadSource.DEFAULT: "google/gemma-2b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-2b",
+        },
+        "Gemma-7B": {
+            DownloadSource.DEFAULT: "google/gemma-7b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-7b",
+        },
+        "Gemma-2B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-2b-it",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-2b-it",
+        },
+        "Gemma-7B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-7b-it",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-7b-it",
+        },
+        "Gemma-1.1-2B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-1.1-2b-it",
+        },
+        "Gemma-1.1-7B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-1.1-7b-it",
+        },
+    },
+    template="gemma",
+)
+
+
+register_model_group(
+    models={
+        "Gemma-2-2B": {
+            DownloadSource.DEFAULT: "google/gemma-2-2b",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-2b",
+        },
+        "Gemma-2-9B": {
+            DownloadSource.DEFAULT: "google/gemma-2-9b",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-9b",
+        },
+        "Gemma-2-27B": {
+            DownloadSource.DEFAULT: "google/gemma-2-27b",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-27b",
+        },
+        "Gemma-2-2B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-2-2b-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-2b-it",
+            DownloadSource.OPENMIND: "LlamaFactory/gemma-2-2b-it",
+        },
+        "Gemma-2-9B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-2-9b-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-9b-it",
+            DownloadSource.OPENMIND: "LlamaFactory/gemma-2-9b-it",
+        },
+        "Gemma-2-27B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-2-27b-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-27b-it",
+        },
+        "Gemma-3-270M": {
+            DownloadSource.DEFAULT: "google/gemma-3-270m",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-270m",
+        },
+        "Gemma-3-1B": {
+            DownloadSource.DEFAULT: "google/gemma-3-1b-pt",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-1b-pt",
+        },
+        "Gemma-3-270M-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-3-270m-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-270m-it",
+        },
+        "Gemma-3-1B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-3-1b-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-1b-it",
+        },
+        "MedGemma-27B-Instruct": {
+            DownloadSource.DEFAULT: "google/medgemma-27b-text-it",
+            DownloadSource.MODELSCOPE: "google/medgemma-27b-text-it",
+        },
+    },
+    template="gemma2",
+)
+
+
+register_model_group(
+    models={
+        "Gemma-3-4B": {
+            DownloadSource.DEFAULT: "google/gemma-3-4b-pt",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-4b-pt",
+        },
+        "Gemma-3-12B": {
+            DownloadSource.DEFAULT: "google/gemma-3-12b-pt",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-12b-pt",
+        },
+        "Gemma-3-27B": {
+            DownloadSource.DEFAULT: "google/gemma-3-27b-pt",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-27b-pt",
+        },
+        "Gemma-3-4B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-3-4b-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-4b-it",
+        },
+        "Gemma-3-12B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-3-12b-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-12b-it",
+        },
+        "Gemma-3-27B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-3-27b-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-27b-it",
+        },
+        "MedGemma-4B": {
+            DownloadSource.DEFAULT: "google/medgemma-4b-pt",
+            DownloadSource.MODELSCOPE: "google/medgemma-4b-pt",
+        },
+        "MedGemma-4B-Instruct": {
+            DownloadSource.DEFAULT: "google/medgemma-4b-it",
+            DownloadSource.MODELSCOPE: "google/medgemma-4b-it",
+        },
+        "MedGemma-27B-Instruct": {
+            DownloadSource.DEFAULT: "google/medgemma-27b-text-it",
+            DownloadSource.MODELSCOPE: "google/medgemma-27b-text-it",
+        },
+    },
+    template="gemma3",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "Gemma-3n-E2B": {
+            DownloadSource.DEFAULT: "google/gemma-3n-E2B",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E2B",
+        },
+        "Gemma-3n-E4B": {
+            DownloadSource.DEFAULT: "google/gemma-3n-E4B",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E4B",
+        },
+        "Gemma-3n-E2B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-3n-E2B-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E2B-it",
+        },
+        "Gemma-3n-E4B-Instruct": {
+            DownloadSource.DEFAULT: "google/gemma-3n-E4B-it",
+            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E4B-it",
+        },
+    },
+    template="gemma3n",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "GLM-4-9B": {
+            DownloadSource.DEFAULT: "zai-org/glm-4-9b",
+            DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b",
+        },
+        "GLM-4-9B-Chat": {
+            DownloadSource.DEFAULT: "zai-org/glm-4-9b-chat",
+            DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b-chat",
+            DownloadSource.OPENMIND: "LlamaFactory/glm-4-9b-chat",
+        },
+        "GLM-4-9B-1M-Chat": {
+            DownloadSource.DEFAULT: "zai-org/glm-4-9b-chat-1m",
+            DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b-chat-1m",
+        },
+        "GLM-4-0414-9B-Chat": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4-9B-0414",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-9B-0414",
+        },
+        "GLM-4-0414-32B-Base": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4-32B-Base-0414",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-Base-0414",
+        },
+        "GLM-4-0414-32B-Chat": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4-32B-0414",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-0414",
+        },
+    },
+    template="glm4",
+)
+
+
+register_model_group(
+    models={
+        "GLM-4.1V-9B-Base": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.1V-9B-Base",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.1V-9B-Base",
+        },
+        "GLM-4.1V-9B-Thinking": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.1V-9B-Thinking",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.1V-9B-Thinking",
+        },
+    },
+    template="glm4v",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "GLM-4.5-Air-Base": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.5-Air-Base",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5-Air-Base",
+        },
+        "GLM-4.5-Base": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.5-Base",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5-Base",
+        },
+        "GLM-4.5-Air-Thinking": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.5-Air",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5-Air",
+        },
+        "GLM-4.5-Thinking": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.5",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5",
+        },
+    },
+    template="glm4_moe",
+)
+
+
+register_model_group(
+    models={
+        "GLM-4.5V-Air-Thinking": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.5V",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5V",
+        },
+        "GLM-4.6V": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.6V",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V",
+        },
+        "GLM-4.6V-Flash": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.6V-Flash",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V-Flash",
+        },
+    },
+    template="glm4_5v",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "GLM-Z1-0414-9B-Chat": {
+            DownloadSource.DEFAULT: "zai-org/GLM-Z1-9B-0414",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-Z1-9B-0414",
+        },
+        "GLM-Z1-0414-32B-Chat": {
+            DownloadSource.DEFAULT: "zai-org/GLM-Z1-32B-0414",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-Z1-32B-0414",
+        },
+    },
+    template="glmz1",
+)
+
+
+register_model_group(
+    models={
+        "GPT-2-Small": {
+            DownloadSource.DEFAULT: "openai-community/gpt2",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/gpt2",
+        },
+        "GPT-2-Medium": {
+            DownloadSource.DEFAULT: "openai-community/gpt2-medium",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/gpt2-medium",
+        },
+        "GPT-2-Large": {
+            DownloadSource.DEFAULT: "openai-community/gpt2-large",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/gpt2-large",
+        },
+        "GPT-2-XL": {
+            DownloadSource.DEFAULT: "openai-community/gpt2-xl",
+            DownloadSource.MODELSCOPE: "goodbai95/GPT2-xl",
+        },
+    },
+)
+
+
+register_model_group(
+    models={
+        "GPT-OSS-20B-Thinking": {
+            DownloadSource.DEFAULT: "openai/gpt-oss-20b",
+            DownloadSource.MODELSCOPE: "openai/gpt-oss-20b",
+        },
+        "GPT-OSS-120B-Thinking": {
+            DownloadSource.DEFAULT: "openai/gpt-oss-120b",
+            DownloadSource.MODELSCOPE: "openai/gpt-oss-120b",
+        },
+    },
+    template="gpt_oss",
+)
+
+
+register_model_group(
+    models={
+        "MiniMax-Text-01-Instruct": {
+            DownloadSource.DEFAULT: "MiniMaxAI/MiniMax-Text-01-hf",
+            DownloadSource.MODELSCOPE: "MiniMaxAI/MiniMax-Text-01",
+        },
+        "MiniMax-M1-40k-Thinking": {
+            DownloadSource.DEFAULT: "MiniMaxAI/MiniMax-M1-40k-hf",
+            DownloadSource.MODELSCOPE: "MiniMaxAI/MiniMax-M1-40k-hf",
+        },
+        "MiniMax-M1-80k-Thinking": {
+            DownloadSource.DEFAULT: "MiniMaxAI/MiniMax-M1-80k-hf",
+            DownloadSource.MODELSCOPE: "MiniMaxAI/MiniMax-M1-80k-hf",
+        },
+    },
+    template="minimax1",
+)
+
+
+register_model_group(
+    models={
+        "MiniMax-M2-Thinking": {
+            DownloadSource.DEFAULT: "MiniMaxAI/MiniMax-M2",
+            DownloadSource.MODELSCOPE: "MiniMaxAI/MiniMax-M2",
+        },
+        "MiniMax-M2.1-Thinking": {
+            DownloadSource.DEFAULT: "MiniMaxAI/MiniMax-M2.1",
+            DownloadSource.MODELSCOPE: "MiniMaxAI/MiniMax-M2.1",
+        },
+    },
+    template="minimax2",
+)
+
+
+register_model_group(
+    models={
+        "Granite-3.0-1B-A400M-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-1b-a400m-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-1b-a400m-base",
+        },
+        "Granite-3.0-3B-A800M-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-3b-a800m-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-3b-a800m-base",
+        },
+        "Granite-3.0-2B-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-2b-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-2b-base",
+        },
+        "Granite-3.0-8B-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-8b-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-8b-base",
+        },
+        "Granite-3.0-1B-A400M-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-1b-a400m-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-1b-a400m-instruct",
+        },
+        "Granite-3.0-3B-A800M-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-3b-a800m-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-3b-a800m-instruct",
+        },
+        "Granite-3.0-2B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-2b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-2b-instruct",
+        },
+        "Granite-3.0-8B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-8b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-8b-instruct",
+        },
+        "Granite-3.1-1B-A400M-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-1b-a400m-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-1b-a400m-base",
+        },
+        "Granite-3.1-3B-A800M-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-3b-a800m-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-3b-a800m-base",
+        },
+        "Granite-3.1-2B-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-2b-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-2b-base",
+        },
+        "Granite-3.1-8B-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-8b-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-8b-base",
+        },
+        "Granite-3.1-1B-A400M-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-1b-a400m-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-1b-a400m-instruct",
+        },
+        "Granite-3.1-3B-A800M-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-3b-a800m-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-3b-a800m-instruct",
+        },
+        "Granite-3.1-2B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-2b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-2b-instruct",
+        },
+        "Granite-3.1-8B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-8b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-8b-instruct",
+        },
+        "Granite-3.2-2B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.2-2b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.2-2b-instruct",
+        },
+        "Granite-3.2-8B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.2-8b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.2-8b-instruct",
+        },
+        "Granite-3.3-2B-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-2b-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-2b-base",
+        },
+        "Granite-3.3-8B-Base": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-8b-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-8b-base",
+        },
+        "Granite-3.3-2B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-2b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-2b-instruct",
+        },
+        "Granite-3.3-8B-Instruct": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-8b-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-8b-instruct",
+        },
+    },
+    template="granite3",
+)
+
+
+register_model_group(
+    models={
+        "Granite-Vision-3.2-2B": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-vision-3.2-2b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-vision-3.2-2b",
+        },
+    },
+    template="granite3_vision",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "Granite-4.0-tiny-preview": {
+            DownloadSource.DEFAULT: "ibm-granite/granite-4.0-tiny-preview",
+            DownloadSource.MODELSCOPE: "ibm-granite/granite-4.0-tiny-preview",
+        },
+    },
+    template="granite4",
+)
+
+
+register_model_group(
+    models={
+        "Hunyuan-0.5B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Hunyuan-0.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/Hunyuan-0.5B-Instruct",
+        },
+        "Hunyuan-1.8B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Hunyuan-1.8B-Instruct",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/Hunyuan-1.8B-Instruct",
+        },
+        "Hunyuan-4B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Hunyuan-4B-Instruct",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/Hunyuan-4B-Instruct",
+        },
+        "Hunyuan-7B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Hunyuan-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/Hunyuan-7B-Instruct",
+        },
+        "Hunyuan-MT-7B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Hunyuan-MT-7B",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/Hunyuan-MT-7B",
+        },
+        "HY-MT1.5-7B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/HY-MT1.5-7B",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/HY-MT1.5-7B",
+        },
+        "Hunyuan-A13B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Hunyuan-A13B-Instruct",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/Hunyuan-A13B-Instruct",
+        },
+    },
+    template="hunyuan",
+)
+
+
+register_model_group(
+    models={
+        "HY-MT1.5-1.8B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/HY-MT1.5-1.8B",
+            DownloadSource.MODELSCOPE: "Tencent-Hunyuan/HY-MT1.5-1.8B",
+        },
+    },
+    template="hunyuan_small",
+)
+
+
+register_model_group(
+    models={
+        "Index-1.9B-Base": {
+            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B",
+            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B",
+        },
+        "Index-1.9B-Base-Pure": {
+            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-Pure",
+            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-Pure",
+        },
+        "Index-1.9B-Chat": {
+            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-Chat",
+            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-Chat",
+        },
+        "Index-1.9B-Character-Chat": {
+            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-Character",
+            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-Character",
+        },
+        "Index-1.9B-Chat-32K": {
+            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-32K",
+            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-32K",
+        },
+    },
+    template="index",
+)
+
+
+register_model_group(
+    models={
+        "InternLM2-7B": {
+            DownloadSource.DEFAULT: "internlm/internlm2-7b",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-7b",
+        },
+        "InternLM2-20B": {
+            DownloadSource.DEFAULT: "internlm/internlm2-20b",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-20b",
+        },
+        "InternLM2-7B-Chat": {
+            DownloadSource.DEFAULT: "internlm/internlm2-chat-7b",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-7b",
+        },
+        "InternLM2-20B-Chat": {
+            DownloadSource.DEFAULT: "internlm/internlm2-chat-20b",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-20b",
+        },
+        "InternLM2.5-1.8B": {
+            DownloadSource.DEFAULT: "internlm/internlm2_5-1_8b",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-1_8b",
+            DownloadSource.OPENMIND: "Intern/internlm2_5-1_8b",
+        },
+        "InternLM2.5-7B": {
+            DownloadSource.DEFAULT: "internlm/internlm2_5-7b",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-7b",
+        },
+        "InternLM2.5-20B": {
+            DownloadSource.DEFAULT: "internlm/internlm2_5-20b",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-20b",
+            DownloadSource.OPENMIND: "Intern/internlm2_5-20b",
+        },
+        "InternLM2.5-1.8B-Chat": {
+            DownloadSource.DEFAULT: "internlm/internlm2_5-1_8b-chat",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat",
+            DownloadSource.OPENMIND: "Intern/internlm2_5-1_8b-chat",
+        },
+        "InternLM2.5-7B-Chat": {
+            DownloadSource.DEFAULT: "internlm/internlm2_5-7b-chat",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+            DownloadSource.OPENMIND: "Intern/internlm2_5-7b-chat",
+        },
+        "InternLM2.5-7B-1M-Chat": {
+            DownloadSource.DEFAULT: "internlm/internlm2_5-7b-chat-1m",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-7b-chat-1m",
+            DownloadSource.OPENMIND: "Intern/internlm2_5-7b-chat-1m",
+        },
+        "InternLM2.5-20B-Chat": {
+            DownloadSource.DEFAULT: "internlm/internlm2_5-20b-chat",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-20b-chat",
+            DownloadSource.OPENMIND: "Intern/internlm2_5-20b-chat",
+        },
+        "InternLM3-8B-Chat": {
+            DownloadSource.DEFAULT: "internlm/internlm3-8b-instruct",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm3-8b-instruct",
+        },
+    },
+    template="intern2",
+)
+
+
+register_model_group(
+    models={
+        "InternVL2.5-2B-MPO": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL2_5-2B-MPO-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL2_5-2B-MPO-hf",
+        },
+        "InternVL2.5-8B-MPO": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL2_5-8B-MPO-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL2_5-8B-MPO-hf",
+        },
+        "InternVL3-1B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-1B-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-1B-hf",
+        },
+        "InternVL3-2B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-2B-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-2B-hf",
+        },
+        "InternVL3-8B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-8B-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-8B-hf",
+        },
+        "InternVL3-14B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-14B-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-14B-hf",
+        },
+        "InternVL3-38B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-38B-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-38B-hf",
+        },
+        "InternVL3-78B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-78B-hf",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-78B-hf",
+        },
+        "InternVL3.5-1B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3_5-1B-HF",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3_5-1B-HF",
+        },
+        "InternVL3.5-2B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3_5-2B-HF",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3_5-2B-HF",
+        },
+        "InternVL3.5-4B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3_5-4B-HF",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3_5-4B-HF",
+        },
+        "InternVL3.5-8B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3_5-8B-HF",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3_5-8B-HF",
+        },
+        "InternVL3.5-14B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3_5-14B-HF",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3_5-14B-HF",
+        },
+        "InternVL3.5-30B-A3B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3_5-30B-A3B-HF",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3_5-30B-A3B-HF",
+        },
+        "InternVL3.5-38B-hf": {
+            DownloadSource.DEFAULT: "OpenGVLab/InternVL3_5-38B-HF",
+            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3_5-38B-HF",
+        },
+    },
+    template="intern_vl",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "Intern-S1-mini": {
+            DownloadSource.DEFAULT: "internlm/Intern-S1-mini",
+            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/Intern-S1-mini",
+        }
+    },
+    template="intern_s1",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "Jamba-v0.1": {
+            DownloadSource.DEFAULT: "ai21labs/Jamba-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Jamba-v0.1",
+        }
+    },
+)
+
+
+register_model_group(
+    models={
+        "Keye-VL-8B-Chat": {
+            DownloadSource.DEFAULT: "Kwai-Keye/Keye-VL-8B-Preview",
+            DownloadSource.MODELSCOPE: "Kwai-Keye/Keye-VL-8B-Preview",
+        },
+    },
+    template="keye_vl",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "Kimi-Dev-72B-Instruct": {
+            DownloadSource.DEFAULT: "moonshotai/Kimi-Dev-72B",
+            DownloadSource.MODELSCOPE: "moonshotai/Kimi-Dev-72B",
+        },
+    },
+    template="qwen",
+)
+
+
+register_model_group(
+    models={
+        "Kimi-VL-A3B-Instruct": {
+            DownloadSource.DEFAULT: "moonshotai/Kimi-VL-A3B-Instruct",
+            DownloadSource.MODELSCOPE: "moonshotai/Kimi-VL-A3B-Instruct",
+        },
+        "Kimi-VL-A3B-Thinking": {
+            DownloadSource.DEFAULT: "moonshotai/Kimi-VL-A3B-Thinking",
+            DownloadSource.MODELSCOPE: "moonshotai/Kimi-VL-A3B-Thinking",
+        },
+        "Kimi-VL-A3B-Thinking-2506": {
+            DownloadSource.DEFAULT: "moonshotai/Kimi-VL-A3B-Thinking-2506",
+            DownloadSource.MODELSCOPE: "moonshotai/Kimi-VL-A3B-Thinking-2506",
+        },
+    },
+    template="kimi_vl",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "LFM2.5-1.2B": {
+            DownloadSource.DEFAULT: "LiquidAI/LFM2.5-1.2B-Base",
+        },
+        "LFM2.5-1.2B-Instruct": {
+            DownloadSource.DEFAULT: "LiquidAI/LFM2.5-1.2B-Instruct",
+        },
+    },
+    template="lfm2",
+)
+
+
+register_model_group(
+    models={
+        "LFM2.5-VL-1.6B": {
+            DownloadSource.DEFAULT: "LiquidAI/LFM2.5-VL-1.6B",
+        },
+    },
+    template="lfm2_vl",
+    multimodal=True,
+)
+
+
+register_model_group(
+    models={
+        "Llama-7B": {
+            DownloadSource.DEFAULT: "huggyllama/llama-7b",
+            DownloadSource.MODELSCOPE: "skyline2006/llama-7b",
+        },
+        "Llama-13B": {
+            DownloadSource.DEFAULT: "huggyllama/llama-13b",
+            DownloadSource.MODELSCOPE: "skyline2006/llama-13b",
+        },
+        "Llama-30B": {
+            DownloadSource.DEFAULT: "huggyllama/llama-30b",
+            DownloadSource.MODELSCOPE: "skyline2006/llama-30b",
+        },
+        "Llama-65B": {
+            DownloadSource.DEFAULT: "huggyllama/llama-65b",
+            DownloadSource.MODELSCOPE: "skyline2006/llama-65b",
+        },
+    }
+)
+
+
+register_model_group(
+    models={
+        "Llama-2-7B": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-hf",
+            DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-ms",
+        },
+        "Llama-2-13B": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-hf",
+            DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-ms",
+        },
+        "Llama-2-70B": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-hf",
+            DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-ms",
+        },
+        "Llama-2-7B-Chat": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-chat-hf",
+            DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-chat-ms",
+        },
+        "Llama-2-13B-Chat": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-chat-hf",
+            DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-chat-ms",
+        },
+        "Llama-2-70B-Chat": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-chat-hf",
+            DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-chat-ms",
+        },
+    },
+    template="llama2",
+)
+
+
+# Llama 3 / 3.1 / 3.2 / 3.3 text checkpoints plus the community "Chinese-Chat"
+# fine-tunes, all registered with template="llama3".
+# Each entry maps a display name to one repository id per download source. The
+# sources listed vary per entry: most have DEFAULT and MODELSCOPE, one also has
+# OPENMIND, and one has DEFAULT only.
+register_model_group(
+    models={
+        # --- Llama 3 ---
+        "Llama-3-8B": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B",
+        },
+        "Llama-3-70B": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B",
+        },
+        "Llama-3-8B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B-Instruct",
+        },
+        "Llama-3-70B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B-Instruct",
+        },
+        # NOTE(review): the OPENMIND repository name ("Llama3-Chinese-8B-Instruct")
+        # differs from the other two sources ("Llama3-8B-Chinese-Chat") - confirm
+        # it refers to the same checkpoint.
+        "Llama-3-8B-Chinese-Chat": {
+            DownloadSource.DEFAULT: "shenzhi-wang/Llama3-8B-Chinese-Chat",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama3-8B-Chinese-Chat",
+            DownloadSource.OPENMIND: "LlamaFactory/Llama3-Chinese-8B-Instruct",
+        },
+        # Only a default-source repository is listed for this entry.
+        "Llama-3-70B-Chinese-Chat": {
+            DownloadSource.DEFAULT: "shenzhi-wang/Llama3-70B-Chinese-Chat",
+        },
+        # --- Llama 3.1 ---
+        "Llama-3.1-8B": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-8B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-8B",
+        },
+        "Llama-3.1-70B": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-70B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-70B",
+        },
+        "Llama-3.1-405B": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-405B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-405B",
+        },
+        "Llama-3.1-8B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+        },
+        "Llama-3.1-70B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-70B-Instruct",
+        },
+        "Llama-3.1-405B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-405B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-405B-Instruct",
+        },
+        "Llama-3.1-8B-Chinese-Chat": {
+            DownloadSource.DEFAULT: "shenzhi-wang/Llama3.1-8B-Chinese-Chat",
+            DownloadSource.MODELSCOPE: "XD_AI/Llama3.1-8B-Chinese-Chat",
+        },
+        "Llama-3.1-70B-Chinese-Chat": {
+            DownloadSource.DEFAULT: "shenzhi-wang/Llama3.1-70B-Chinese-Chat",
+            DownloadSource.MODELSCOPE: "XD_AI/Llama3.1-70B-Chinese-Chat",
+        },
+        # --- Llama 3.2 (text) and 3.3: repository ids drop the "Meta-" prefix ---
+        "Llama-3.2-1B": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-1B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-1B",
+        },
+        "Llama-3.2-3B": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-3B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-3B",
+        },
+        "Llama-3.2-1B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-1B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-1B-Instruct",
+        },
+        "Llama-3.2-3B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-3B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-3B-Instruct",
+        },
+        "Llama-3.3-70B-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.3-70B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.3-70B-Instruct",
+        },
+    },
+    template="llama3",
+)
+
+
+register_model_group(  # Llama 3.2 Vision: 11B and 90B, each as base and Instruct
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "Llama-3.2-11B-Vision": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-11B-Vision",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-11B-Vision",
+        },
+        "Llama-3.2-11B-Vision-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-11B-Vision-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-11B-Vision-Instruct",
+        },
+        "Llama-3.2-90B-Vision": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-90B-Vision",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-90B-Vision",
+        },
+        "Llama-3.2-90B-Vision-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-90B-Vision-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-90B-Vision-Instruct",
+        },
+    },
+    template="mllama",  # one template name passed for all four entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # Llama 4: Scout (16E) and Maverick (128E), each as base and Instruct
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "Llama-4-Scout-17B-16E": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-4-Scout-17B-16E",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Scout-17B-16E",
+        },
+        "Llama-4-Scout-17B-16E-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Scout-17B-16E-Instruct",
+        },
+        "Llama-4-Maverick-17B-128E": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-4-Maverick-17B-128E",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Maverick-17B-128E",
+        },
+        "Llama-4-Maverick-17B-128E-Instruct": {
+            DownloadSource.DEFAULT: "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Maverick-17B-128E-Instruct",
+        },
+    },
+    template="llama4",  # one template name passed for all four entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-1.5: 7B and 13B
+    models={  # keys end in "-Chat"; the repo IDs are the llava-hf "-hf" checkpoints
+        "LLaVA-1.5-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-1.5-7b-hf",
+            DownloadSource.MODELSCOPE: "swift/llava-1.5-7b-hf",
+        },
+        "LLaVA-1.5-13B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-1.5-13b-hf",
+            DownloadSource.MODELSCOPE: "swift/llava-1.5-13b-hf",
+        },
+    },
+    template="llava",  # one template name passed for both entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT (repo IDs are the v1.6 vicuna checkpoints): 7B and 13B
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "LLaVA-NeXT-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-vicuna-7b-hf",
+            DownloadSource.MODELSCOPE: "swift/llava-v1.6-vicuna-7b-hf",
+        },
+        "LLaVA-NeXT-13B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-vicuna-13b-hf",
+            DownloadSource.MODELSCOPE: "swift/llava-v1.6-vicuna-13b-hf",
+        },
+    },
+    template="llava_next",  # one template name passed for both entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT on a Mistral 7B base (single entry)
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "LLaVA-NeXT-Mistral-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-mistral-7b-hf",
+            DownloadSource.MODELSCOPE: "swift/llava-v1.6-mistral-7b-hf",
+        },
+    },
+    template="llava_next_mistral",  # this variant gets its own template name
+    multimodal=True,  # registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT on a Llama 3 8B base (single entry)
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "LLaVA-NeXT-Llama3-8B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llama3-llava-next-8b-hf",
+            DownloadSource.MODELSCOPE: "swift/llama3-llava-next-8b-hf",
+        },
+    },
+    template="llava_next_llama3",  # this variant gets its own template name
+    multimodal=True,  # registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT 34B (single entry)
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "LLaVA-NeXT-34B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-34b-hf",
+            DownloadSource.MODELSCOPE: "LLM-Research/llava-v1.6-34b-hf",
+        },
+    },
+    template="llava_next_yi",  # template name carries a "_yi" suffix for this size
+    multimodal=True,  # registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT large sizes: 72B and 110B
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "LLaVA-NeXT-72B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-next-72b-hf",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/llava-next-72b-hf",
+        },
+        "LLaVA-NeXT-110B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-next-110b-hf",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/llava-next-110b-hf",
+        },
+    },
+    template="llava_next_qwen",  # template name carries a "_qwen" suffix for these sizes
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT-Video 7B: plain and DPO checkpoints
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "LLaVA-NeXT-Video-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-hf",
+            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-7B-hf",
+        },
+        "LLaVA-NeXT-Video-7B-DPO-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-DPO-hf",
+            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-7B-DPO-hf",
+        },
+    },
+    template="llava_next_video",  # one template name passed for both entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT-Video 7B, 32K variant (single entry)
+    models={  # key spells "32k" in lower case; the repo IDs spell "32K"
+        "LLaVA-NeXT-Video-7B-32k-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
+            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-7B-32K-hf",
+        },
+    },
+    template="llava_next_video_mistral",  # this variant gets its own template name
+    multimodal=True,  # registered with the multimodal flag set
+)
+
+
+register_model_group(  # LLaVA-NeXT-Video 34B: plain and DPO checkpoints
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "LLaVA-NeXT-Video-34B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-34B-hf",
+            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-34B-hf",
+        },
+        "LLaVA-NeXT-Video-34B-DPO-Chat": {  # only a DEFAULT repo ID; no MODELSCOPE entry here
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-34B-DPO-hf",
+        },
+    },
+    template="llava_next_video_yi",  # one template name passed for both entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # MiMo 7B text models: Base, SFT, RL and RL-ZERO checkpoints
+    models={  # DEFAULT and MODELSCOPE use the same XiaomiMiMo repo ID for every entry
+        "MiMo-7B-Base": {
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-Base",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-Base",
+        },
+        "MiMo-7B-Instruct": {  # key says "Instruct"; the repo ID is the "-SFT" checkpoint
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-SFT",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-SFT",
+        },
+        "MiMo-7B-Instruct-RL": {  # key says "Instruct-RL"; the repo ID is the "-RL" checkpoint
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-RL",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-RL",
+        },
+        "MiMo-7B-RL-ZERO": {
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-RL-ZERO",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-RL-ZERO",
+        },
+    },
+    template="mimo",  # one template name for all four entries; no multimodal argument is passed
+)
+
+
+register_model_group(  # MiMo-V2-Flash: Base and the un-suffixed checkpoint
+    models={  # DEFAULT and MODELSCOPE use the same XiaomiMiMo repo ID for every entry
+        "MiMo-V2-Flash-Base": {
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-V2-Flash-Base",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-V2-Flash-Base",
+        },
+        "MiMo-V2-Flash": {
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-V2-Flash",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-V2-Flash",
+        },
+    },
+    template="mimo_v2",  # one template name for both entries; no multimodal argument is passed
+)
+
+
+register_model_group(  # MiMo-VL 7B RL checkpoints: original and 2508 release
+    models={  # DEFAULT and MODELSCOPE use the same XiaomiMiMo repo ID for every entry
+        "MiMo-7B-VL-RL": {  # key orders the parts "7B-VL"; the repo ID orders them "VL-7B"
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-VL-7B-RL",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-VL-7B-RL",
+        },
+        "MiMo-VL-7B-RL-2508": {
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-VL-7B-RL-2508",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-VL-7B-RL-2508",
+        },
+    },
+    template="mimo_vl",  # one template name passed for both entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # MiMo-VL 7B SFT checkpoints: original and 2508 release
+    models={  # DEFAULT and MODELSCOPE use the same XiaomiMiMo repo ID for every entry
+        "MiMo-7B-VL-Instruct": {  # key says "Instruct"; the repo ID is the "-SFT" checkpoint
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-VL-7B-SFT",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-VL-7B-SFT",
+        },
+        "MiMo-VL-7B-SFT-2508": {
+            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-VL-7B-SFT-2508",
+            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-VL-7B-SFT-2508",
+        },
+    },
+    template="qwen2_vl",  # NOTE(review): the MiMo-VL RL group passes "mimo_vl"; confirm the difference is intended
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # MiniCPM4 / MiniCPM4.1 text models: 0.5B and 8B
+    models={  # keys end in "-Chat"; the repo IDs have no such suffix
+        "MiniCPM4-0.5B-Chat": {
+            DownloadSource.DEFAULT: "openbmb/MiniCPM4-0.5B",
+            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM4-0.5B",
+        },
+        "MiniCPM4-8B-Chat": {
+            DownloadSource.DEFAULT: "openbmb/MiniCPM4-8B",
+            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM4-8B",
+        },
+        "MiniCPM4.1-8B-Chat": {
+            DownloadSource.DEFAULT: "openbmb/MiniCPM4.1-8B",
+            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM4.1-8B",
+        },
+    },
+    template="cpm4",  # one template name for all three entries; no multimodal argument is passed
+)
+
+
+register_model_group(  # MiniCPM-o 2.6 (single entry)
+    models={  # key writes the version as "2.6"; the repo IDs write it as "2_6"
+        "MiniCPM-o-2.6": {
+            DownloadSource.DEFAULT: "openbmb/MiniCPM-o-2_6",
+            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-o-2_6",
+        },
+    },
+    template="minicpm_o",  # template name passed for this entry
+    multimodal=True,  # registered with the multimodal flag set
+)
+
+
+register_model_group(  # MiniCPM-V: versions 2.6, 4 and 4.5
+    models={  # keys write versions with dots; the repo IDs use underscores ("2_6", "4_5")
+        "MiniCPM-V-2.6": {
+            DownloadSource.DEFAULT: "openbmb/MiniCPM-V-2_6",
+            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-V-2_6",
+        },
+        "MiniCPM-V-4": {
+            DownloadSource.DEFAULT: "openbmb/MiniCPM-V-4",
+            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-V-4",
+        },
+        "MiniCPM-V-4.5": {
+            DownloadSource.DEFAULT: "openbmb/MiniCPM-V-4_5",
+            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-V-4_5",
+        },
+    },
+    template="minicpm_v",  # one template name passed for all three entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # Ministral-8B-Instruct-2410 plus Mistral-Nemo 2407 (Base and Instruct)
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "Ministral-8B-Instruct-2410": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-8B-Instruct-2410",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-8B-Instruct-2410",
+        },
+        "Mistral-Nemo-Base-2407": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-Nemo-Base-2407",
+            DownloadSource.MODELSCOPE: "LLM-Research/Mistral-Nemo-Base-2407",
+        },
+        "Mistral-Nemo-Instruct-2407": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-Nemo-Instruct-2407",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-Nemo-Instruct-2407",
+        },
+    },
+    template="ministral",  # the Mistral-Nemo entries are grouped under this template name as well
+)
+
+
+register_model_group(  # Mistral 7B: v0.1, v0.2 and v0.3, each as base and Instruct
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "Mistral-7B-v0.1": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1",
+        },
+        "Mistral-7B-v0.2": {  # DEFAULT is under the alpindale org ("-hf" repo), not mistralai
+            DownloadSource.DEFAULT: "alpindale/Mistral-7B-v0.2-hf",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.2-hf",
+        },
+        "Mistral-7B-v0.3": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.3",
+            DownloadSource.MODELSCOPE: "LLM-Research/mistral-7b-v0.3",  # this ID is written in lower case
+        },
+        "Mistral-7B-Instruct-v0.1": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1",
+        },
+        "Mistral-7B-Instruct-v0.2": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.2",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.2",
+        },
+        "Mistral-7B-Instruct-v0.3": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.3",
+            DownloadSource.MODELSCOPE: "LLM-Research/Mistral-7B-Instruct-v0.3",
+        },
+    },
+    template="mistral",  # one template name for all six entries; no multimodal argument is passed
+)
+
+register_model_group(  # Ministral 3 (2512): 3B, 8B and 14B, each as Base and Instruct
+    models={  # DEFAULT and MODELSCOPE use the same mistralai repo ID for every entry
+        "Ministral-3-3B-Base-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-3B-Base-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-3B-Base-2512",
+        },
+        "Ministral-3-8B-Base-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-8B-Base-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-8B-Base-2512",
+        },
+        "Ministral-3-14B-Base-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-14B-Base-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-14B-Base-2512",
+        },
+        "Ministral-3-3B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-3B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-3B-Instruct-2512",
+        },
+        "Ministral-3-8B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-8B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-8B-Instruct-2512",
+        },
+        "Ministral-3-14B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-14B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-14B-Instruct-2512",
+        },
+    },
+    template="ministral3",  # one template name passed for all six entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # Mistral-Small 24B (2501): Base and Instruct
+    models={  # DEFAULT and MODELSCOPE use the same mistralai repo ID for every entry
+        "Mistral-Small-24B-Base-2501": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-Small-24B-Base-2501",
+            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-24B-Base-2501",
+        },
+        "Mistral-Small-24B-Instruct-2501": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-Small-24B-Instruct-2501",
+            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-24B-Instruct-2501",
+        },
+    },
+    template="mistral_small",  # no multimodal argument is passed for this 2501 group
+)
+
+
+register_model_group(  # Mistral-Small 3.1 (Base, Instruct) and 3.2 (Instruct), all 24B
+    models={  # keys omit the date suffix; the repo IDs carry it ("-2503", "-2506")
+        "Mistral-Small-3.1-24B-Base": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.1-24B-Base-2503",
+            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.1-24B-Base-2503",
+        },
+        "Mistral-Small-3.1-24B-Instruct": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        },
+        "Mistral-Small-3.2-24B-Instruct": {
+            DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
+            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
+        },
+    },
+    template="mistral_small",  # one template name passed for all three entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # Mixtral v0.1: 8x7B and 8x22B, each as base and Instruct
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "Mixtral-8x7B-v0.1": {
+            DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-v0.1",
+        },
+        "Mixtral-8x22B-v0.1": {
+            DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x22B-v0.1",
+        },
+        "Mixtral-8x7B-v0.1-Instruct": {  # key puts "Instruct" after the version; the repo IDs put it before
+            DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-Instruct-v0.1",
+        },
+        "Mixtral-8x22B-v0.1-Instruct": {  # same key/repo word-order difference as the 8x7B entry
+            DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-Instruct-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x22B-Instruct-v0.1",
+        },
+    },
+    template="mistral",  # one template name for all four entries; no multimodal argument is passed
+)
+
+
+register_model_group(  # MobileLLM-R1: 140M, 360M and 950M, each as Base and Instruct
+    models={  # DEFAULT and MODELSCOPE use the same facebook repo ID for every entry
+        "MobileLLM-R1-140M-Base": {  # repo IDs spell the suffix "-base" in lower case
+            DownloadSource.DEFAULT: "facebook/MobileLLM-R1-140M-base",
+            DownloadSource.MODELSCOPE: "facebook/MobileLLM-R1-140M-base",
+        },
+        "MobileLLM-R1-360M-Base": {
+            DownloadSource.DEFAULT: "facebook/MobileLLM-R1-360M-base",
+            DownloadSource.MODELSCOPE: "facebook/MobileLLM-R1-360M-base",
+        },
+        "MobileLLM-R1-950M-Base": {
+            DownloadSource.DEFAULT: "facebook/MobileLLM-R1-950M-base",
+            DownloadSource.MODELSCOPE: "facebook/MobileLLM-R1-950M-base",
+        },
+        "MobileLLM-R1-140M-Instruct": {  # "Instruct" keys map to the repo IDs that have no suffix
+            DownloadSource.DEFAULT: "facebook/MobileLLM-R1-140M",
+            DownloadSource.MODELSCOPE: "facebook/MobileLLM-R1-140M",
+        },
+        "MobileLLM-R1-360M-Instruct": {
+            DownloadSource.DEFAULT: "facebook/MobileLLM-R1-360M",
+            DownloadSource.MODELSCOPE: "facebook/MobileLLM-R1-360M",
+        },
+        "MobileLLM-R1-950M-Instruct": {
+            DownloadSource.DEFAULT: "facebook/MobileLLM-R1-950M",
+            DownloadSource.MODELSCOPE: "facebook/MobileLLM-R1-950M",
+        },
+    },
+    template="llama3",  # this group is registered under the "llama3" template name
+)
+
+
+register_model_group(  # Moonlight-16B-A3B: base and Instruct
+    models={  # DEFAULT and MODELSCOPE use the same moonshotai repo ID for every entry
+        "Moonlight-16B-A3B": {
+            DownloadSource.DEFAULT: "moonshotai/Moonlight-16B-A3B",
+            DownloadSource.MODELSCOPE: "moonshotai/Moonlight-16B-A3B",
+        },
+        "Moonlight-16B-A3B-Instruct": {
+            DownloadSource.DEFAULT: "moonshotai/Moonlight-16B-A3B-Instruct",
+            DownloadSource.MODELSCOPE: "moonshotai/Moonlight-16B-A3B-Instruct",
+        },
+    },
+    template="moonlight",  # one template name for both entries; no multimodal argument is passed
+)
+
+
+register_model_group(  # OLMo: 1B, 7B, 7B-Chat and 1.7-7B
+    models={  # every entry lists only a DEFAULT repo ID; none has a MODELSCOPE entry
+        "OLMo-1B": {
+            DownloadSource.DEFAULT: "allenai/OLMo-1B-hf",
+        },
+        "OLMo-7B": {
+            DownloadSource.DEFAULT: "allenai/OLMo-7B-hf",
+        },
+        "OLMo-7B-Chat": {  # repo is under the ssec-uw org and is named "Instruct", not "Chat"
+            DownloadSource.DEFAULT: "ssec-uw/OLMo-7B-Instruct-hf",
+        },
+        "OLMo-1.7-7B": {
+            DownloadSource.DEFAULT: "allenai/OLMo-1.7-7B-hf",
+        },
+    },  # no template= argument is passed for this group
+)
+
+
+register_model_group(  # OpenChat 3.5 7B (single entry)
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "OpenChat3.5-7B-Chat": {
+            DownloadSource.DEFAULT: "openchat/openchat-3.5-0106",
+            DownloadSource.MODELSCOPE: "xcwzxcwz/openchat-3.5-0106",  # a different org from the DEFAULT repo
+        }
+    },
+    template="openchat",  # template name passed for this entry
+)
+
+
+register_model_group(  # OpenChat 3.6 8B (single entry)
+    models={  # only a DEFAULT repo ID is listed; there is no MODELSCOPE entry
+        "OpenChat3.6-8B-Chat": {
+            DownloadSource.DEFAULT: "openchat/openchat-3.6-8b-20240522",
+        }
+    },
+    template="openchat-3.6",  # this template name contains a hyphen and a dot
+)
+
+
+register_model_group(  # OpenCoder: 1.5B and 8B, each as Base and Instruct
+    models={  # DEFAULT and MODELSCOPE use the same infly repo ID for every entry
+        "OpenCoder-1.5B-Base": {
+            DownloadSource.DEFAULT: "infly/OpenCoder-1.5B-Base",
+            DownloadSource.MODELSCOPE: "infly/OpenCoder-1.5B-Base",
+        },
+        "OpenCoder-8B-Base": {
+            DownloadSource.DEFAULT: "infly/OpenCoder-8B-Base",
+            DownloadSource.MODELSCOPE: "infly/OpenCoder-8B-Base",
+        },
+        "OpenCoder-1.5B-Instruct": {
+            DownloadSource.DEFAULT: "infly/OpenCoder-1.5B-Instruct",
+            DownloadSource.MODELSCOPE: "infly/OpenCoder-1.5B-Instruct",
+        },
+        "OpenCoder-8B-Instruct": {
+            DownloadSource.DEFAULT: "infly/OpenCoder-8B-Instruct",
+            DownloadSource.MODELSCOPE: "infly/OpenCoder-8B-Instruct",
+        },
+    },
+    template="opencoder",  # one template name for all four entries; no multimodal argument is passed
+)
+
+
+register_model_group(  # PaliGemma 3B: "pt" at 224/448/896 and "mix" at 224/448
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "PaliGemma-3B-pt-224": {
+            DownloadSource.DEFAULT: "google/paligemma-3b-pt-224",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-224",
+        },
+        "PaliGemma-3B-pt-448": {
+            DownloadSource.DEFAULT: "google/paligemma-3b-pt-448",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-448",
+        },
+        "PaliGemma-3B-pt-896": {
+            DownloadSource.DEFAULT: "google/paligemma-3b-pt-896",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-896",
+        },
+        "PaliGemma-3B-mix-224": {
+            DownloadSource.DEFAULT: "google/paligemma-3b-mix-224",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-mix-224",
+        },
+        "PaliGemma-3B-mix-448": {
+            DownloadSource.DEFAULT: "google/paligemma-3b-mix-448",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-mix-448",
+        },
+    },
+    template="paligemma",  # one template name passed for all five entries
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # PaliGemma2: 3B/10B/28B "pt" at 224/448/896 and "mix" at 224/448
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "PaliGemma2-3B-pt-224": {
+            DownloadSource.DEFAULT: "google/paligemma2-3b-pt-224",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-3b-pt-224",
+        },
+        "PaliGemma2-3B-pt-448": {
+            DownloadSource.DEFAULT: "google/paligemma2-3b-pt-448",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-3b-pt-448",
+        },
+        "PaliGemma2-3B-pt-896": {
+            DownloadSource.DEFAULT: "google/paligemma2-3b-pt-896",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-3b-pt-896",
+        },
+        "PaliGemma2-10B-pt-224": {
+            DownloadSource.DEFAULT: "google/paligemma2-10b-pt-224",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-10b-pt-224",
+        },
+        "PaliGemma2-10B-pt-448": {
+            DownloadSource.DEFAULT: "google/paligemma2-10b-pt-448",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-10b-pt-448",
+        },
+        "PaliGemma2-10B-pt-896": {
+            DownloadSource.DEFAULT: "google/paligemma2-10b-pt-896",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-10b-pt-896",
+        },
+        "PaliGemma2-28B-pt-224": {
+            DownloadSource.DEFAULT: "google/paligemma2-28b-pt-224",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-28b-pt-224",
+        },
+        "PaliGemma2-28B-pt-448": {
+            DownloadSource.DEFAULT: "google/paligemma2-28b-pt-448",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-28b-pt-448",
+        },
+        "PaliGemma2-28B-pt-896": {
+            DownloadSource.DEFAULT: "google/paligemma2-28b-pt-896",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-28b-pt-896",
+        },
+        "PaliGemma2-3B-mix-224": {  # NOTE(review): all "mix" MODELSCOPE IDs are mlx-community "-bf16" repos; confirm
+            DownloadSource.DEFAULT: "google/paligemma2-3b-mix-224",
+            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-3b-mix-224-bf16",
+        },
+        "PaliGemma2-3B-mix-448": {
+            DownloadSource.DEFAULT: "google/paligemma2-3b-mix-448",
+            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-3b-mix-448-bf16",
+        },
+        "PaliGemma2-10B-mix-224": {
+            DownloadSource.DEFAULT: "google/paligemma2-10b-mix-224",
+            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-10b-mix-224-bf16",
+        },
+        "PaliGemma2-10B-mix-448": {
+            DownloadSource.DEFAULT: "google/paligemma2-10b-mix-448",
+            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-10b-mix-448-bf16",
+        },
+        "PaliGemma2-28B-mix-224": {
+            DownloadSource.DEFAULT: "google/paligemma2-28b-mix-224",
+            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-28b-mix-224-bf16",
+        },
+        "PaliGemma2-28B-mix-448": {
+            DownloadSource.DEFAULT: "google/paligemma2-28b-mix-448",
+            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-28b-mix-448-bf16",
+        },
+    },
+    template="paligemma",  # PaliGemma2 is registered under the plain "paligemma" template name
+    multimodal=True,  # the whole group is registered with the multimodal flag set
+)
+
+
+register_model_group(  # Phi-3 mini/medium and Phi-3.5 mini/MoE instruct checkpoints
+    models={  # keys give sizes ("4B", "14B"); the repo IDs use "mini" and "medium"
+        "Phi-3-4B-4k-Instruct": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3-mini-4k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-4k-instruct",
+        },
+        "Phi-3-4B-128k-Instruct": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3-mini-128k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-128k-instruct",
+        },
+        "Phi-3-14B-8k-Instruct": {  # NOTE(review): key says "8k" but both repo IDs are "medium-4k"; confirm
+            DownloadSource.DEFAULT: "microsoft/Phi-3-medium-4k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-4k-instruct",
+        },
+        "Phi-3-14B-128k-Instruct": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3-medium-128k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-128k-instruct",
+        },
+        "Phi-3.5-4B-instruct": {  # the Phi-3.5 keys use a lower-case "instruct"
+            DownloadSource.DEFAULT: "microsoft/Phi-3.5-mini-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3.5-mini-instruct",
+        },
+        "Phi-3.5-MoE-42B-A6.6B-instruct": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3.5-MoE-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3.5-MoE-instruct",
+        },
+    },
+    template="phi",  # one template name for all six entries; no multimodal argument is passed
+)
+
+
+register_model_group(  # Phi-3 small: 8k and 128k instruct checkpoints
+    models={  # keys give the size as "7B"; the repo IDs use "small"
+        "Phi-3-7B-8k-Instruct": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3-small-8k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-small-8k-instruct",
+        },
+        "Phi-3-7B-128k-Instruct": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3-small-128k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-small-128k-instruct",
+        },
+    },
+    template="phi_small",  # the small checkpoints get their own template name
+)
+
+
+register_model_group(  # Phi-4 (single entry)
+    models={  # key adds "14B-Instruct"; the repo IDs are plain "phi-4"
+        "Phi-4-14B-Instruct": {
+            DownloadSource.DEFAULT: "microsoft/phi-4",
+            DownloadSource.MODELSCOPE: "LLM-Research/phi-4",
+        },
+    },
+    template="phi4",  # template name passed for this entry
+)
+
+register_model_group(  # Phi-4 mini instruct (single entry)
+    models={  # key gives the size as "3.8B"; the repo IDs use "mini"
+        "Phi-4-3.8B-instruct": {
+            DownloadSource.DEFAULT: "microsoft/Phi-4-mini-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-4-mini-instruct",
+        },
+    },
+    template="phi4_mini",  # the mini checkpoint gets its own template name
+)
+
+register_model_group(  # Pixtral 12B (single entry)
+    models={  # model key -> {DownloadSource member: "org/name" repo ID}
+        "Pixtral-12B": {
+            DownloadSource.DEFAULT: "mistral-community/pixtral-12b",  # mistral-community org
+            DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b",
+        }
+    },
+    template="pixtral",  # template name passed for this entry
+    multimodal=True,  # registered with the multimodal flag set
+)
+
+
+register_model_group(  # Qwen2: base, Instruct, GPTQ/AWQ quantized Instruct, and Qwen2-Math
+    models={  # DEFAULT and MODELSCOPE use the same Qwen repo ID for every entry
+        "Qwen2-0.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B",
+        },
+        "Qwen2-1.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B",
+        },
+        "Qwen2-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B",
+        },
+        "Qwen2-72B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-72B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B",
+        },
+        "Qwen2-MoE-57B-A14B": {  # key inserts "MoE"; the repo IDs do not
+            DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-57B-A14B",
+        },
+        "Qwen2-0.5B-Instruct": {  # 0.5B/1.5B/7B Instruct also list an OPENMIND repo ID
+            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct",
+            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-0.5B-Instruct",
+        },
+        "Qwen2-1.5B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct",
+            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-1.5B-Instruct",
+        },
+        "Qwen2-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct",
+            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-7B-Instruct",
+        },
+        "Qwen2-72B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct",
+        },
+        "Qwen2-MoE-57B-A14B-Instruct": {  # key inserts "MoE"; the repo IDs do not
+            DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-57B-A14B-Instruct",
+        },
+        "Qwen2-0.5B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2-0.5B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-0.5B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct-AWQ",
+        },
+        "Qwen2-1.5B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2-1.5B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-1.5B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct-AWQ",
+        },
+        "Qwen2-7B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2-7B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-7B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct-AWQ",
+        },
+        "Qwen2-72B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2-72B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-72B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct-AWQ",
+        },
+        "Qwen2-57B-A14B-Instruct-GPTQ-Int4": {  # unlike the other 57B-A14B keys, this one has no "MoE"
+            DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-Math-1.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-1.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-1.5B",
+        },
+        "Qwen2-Math-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-7B",
+        },
+        "Qwen2-Math-72B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-72B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-72B",
+        },
+        "Qwen2-Math-1.5B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-1.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-1.5B-Instruct",
+        },
+        "Qwen2-Math-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-7B-Instruct",
+        },
+        "Qwen2-Math-72B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-72B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-72B-Instruct",
+        },
+    },
+    template="qwen",  # one template name for every entry; no multimodal argument is passed
+)
+
+
+register_model_group(
+    models={
+        "Qwen2.5-0.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B",
+        },
+        "Qwen2.5-1.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B",
+        },
+        "Qwen2.5-3B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B",
+        },
+        "Qwen2.5-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B",
+        },
+        "Qwen2.5-14B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B",
+        },
+        "Qwen2.5-32B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B",
+        },
+        "Qwen2.5-72B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B",
+        },
+        "Qwen2.5-0.5B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct",
+        },
+        "Qwen2.5-1.5B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct",
+        },
+        "Qwen2.5-3B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct",
+        },
+        "Qwen2.5-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct",
+        },
+        "Qwen2.5-14B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct",
+        },
+        "Qwen2.5-32B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct",
+        },
+        "Qwen2.5-72B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct",
+        },
+        "Qwen2.5-7B-Instruct-1M": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-1M",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-1M",
+        },
+        "Qwen2.5-14B-Instruct-1M": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-1M",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-1M",
+        },
+        "Qwen2.5-0.5B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2.5-0.5B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2.5-0.5B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct-AWQ",
+        },
+        "Qwen2.5-1.5B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2.5-1.5B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2.5-1.5B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct-AWQ",
+        },
+        "Qwen2.5-3B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2.5-3B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2.5-3B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct-AWQ",
+        },
+        "Qwen2.5-7B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2.5-7B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2.5-7B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-AWQ",
+        },
+        "Qwen2.5-14B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2.5-14B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2.5-14B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-AWQ",
+        },
+        "Qwen2.5-32B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2.5-32B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2.5-32B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct-AWQ",
+        },
+        "Qwen2.5-72B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2.5-72B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2.5-72B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct-AWQ",
+        },
+        "Qwen2.5-Coder-0.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-0.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-0.5B",
+        },
+        "Qwen2.5-Coder-1.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-1.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-1.5B",
+        },
+        "Qwen2.5-Coder-3B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-3B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-3B",
+        },
+        "Qwen2.5-Coder-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-7B",
+        },
+        "Qwen2.5-Coder-14B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-14B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-14B",
+        },
+        "Qwen2.5-Coder-32B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-32B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-32B",
+        },
+        "Qwen2.5-Coder-0.5B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+        },
+        "Qwen2.5-Coder-1.5B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+        },
+        "Qwen2.5-Coder-3B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-3B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-3B-Instruct",
+        },
+        "Qwen2.5-Coder-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-7B-Instruct",
+        },
+        "Qwen2.5-Coder-14B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-14B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-14B-Instruct",
+        },
+        "Qwen2.5-Coder-32B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-32B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-32B-Instruct",
+        },
+        "Qwen2.5-Math-1.5B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-1.5B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Math-1.5B",
+        },
+        "Qwen2.5-Math-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Math-7B",
+        },
+        "Qwen2.5-Math-72B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-72B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Math-72B",
+        },
+        "Qwen2.5-Math-1.5B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-1.5B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+        },
+        "Qwen2.5-Math-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-7B-Instruct",
+        },
+        "Qwen2.5-Math-72B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-72B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-72B-Instruct",
+        },
+        "QwQ-32B-Preview-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/QwQ-32B-Preview",
+            DownloadSource.MODELSCOPE: "Qwen/QwQ-32B-Preview",
+        },
+        "QwQ-32B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/QwQ-32B",
+            DownloadSource.MODELSCOPE: "Qwen/QwQ-32B",
+        },
+    },
+    template="qwen",
+)
+
+
+register_model_group(
+    models={
+        "Qwen3-0.6B-Base": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B-Base",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B-Base",
+        },
+        "Qwen3-1.7B-Base": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B-Base",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B-Base",
+        },
+        "Qwen3-4B-Base": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Base",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Base",
+        },
+        "Qwen3-8B-Base": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-8B-Base",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B-Base",
+        },
+        "Qwen3-14B-Base": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-14B-Base",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B-Base",
+        },
+        "Qwen3-30B-A3B-Base": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Base",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Base",
+        },
+        "Qwen3-0.6B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B",
+        },
+        "Qwen3-1.7B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B",
+        },
+        "Qwen3-4B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-4B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B",
+        },
+        "Qwen3-4B-Thinking-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Thinking-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Thinking-2507",
+        },
+        "Qwen3-8B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-8B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B",
+        },
+        "Qwen3-14B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-14B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B",
+        },
+        "Qwen3-32B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-32B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B",
+        },
+        "Qwen3-30B-A3B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B",
+        },
+        "Qwen3-30B-A3B-Thinking-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Thinking-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Thinking-2507",
+        },
+        "Qwen3-235B-A22B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B",
+        },
+        "Qwen3-235B-A22B-Thinking-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Thinking-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Thinking-2507",
+        },
+        "Qwen3-0.6B-Thinking-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B-GPTQ-Int8",
+        },
+        "Qwen3-1.7B-Thinking-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B-GPTQ-Int8",
+        },
+        "Qwen3-4B-Thinking-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-AWQ",
+        },
+        "Qwen3-8B-Thinking-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-8B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B-AWQ",
+        },
+        "Qwen3-14B-Thinking-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-14B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B-AWQ",
+        },
+        "Qwen3-32B-Thinking-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-32B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B-AWQ",
+        },
+        "Qwen3-30B-A3B-Thinking-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
+        },
+        "Qwen3-235B-A22B-Thinking-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
+        },
+        "Qwen/Qwen3-Next-80B-A3B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-Next-80B-A3B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-Next-80B-A3B-Thinking",
+        },
+    },
+    template="qwen3",
+)
+
+
+register_model_group(  # Qwen3 Instruct-2507 and Qwen3-Next Instruct checkpoints
+    models={
+        "Qwen3-4B-Instruct-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Instruct-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Instruct-2507",
+        },
+        "Qwen3-30B-A3B-Instruct-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Instruct-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Instruct-2507",
+        },
+        "Qwen3-235B-A22B-Instruct-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Instruct-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Instruct-2507",
+        },
+        "Qwen3-Next-80B-A3B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-Next-80B-A3B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-Next-80B-A3B-Instruct",
+        },
+    },
+    template="qwen3_nothink",  # presumably the non-thinking counterpart of "qwen3"; confirm in the template registry
+)
+
+
+register_model_group(  # Qwen2-Audio base and Instruct checkpoints
+    models={
+        "Qwen2-Audio-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Audio-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Audio-7B",
+        },
+        "Qwen2-Audio-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-Audio-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Audio-7B-Instruct",
+        },
+    },
+    template="qwen2_audio",
+    multimodal=True,  # group is flagged multimodal; how the flag is consumed is not visible here
+)
+
+
+register_model_group(  # Qwen2.5-Omni checkpoints, including GPTQ-Int4 and AWQ variants of the 7B model
+    models={
+        "Qwen2.5-Omni-3B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-3B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-3B",
+        },
+        "Qwen2.5-Omni-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-7B",
+        },
+        "Qwen2.5-Omni-7B-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-7B-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-7B-GPTQ-Int4",
+        },
+        "Qwen2.5-Omni-7B-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-7B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-7B-AWQ",
+        },
+    },
+    template="qwen2_omni",
+    multimodal=True,  # group is flagged multimodal; how the flag is consumed is not visible here
+)
+
+
+register_model_group(  # Qwen3-Omni Captioner and Instruct checkpoints
+    models={
+        "Qwen3-Omni-30B-A3B-Captioner": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
+        },
+        "Qwen3-Omni-30B-A3B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+        },
+    },
+    template="qwen3_omni_nothink",  # the Thinking checkpoint is registered separately with "qwen3_omni"
+    multimodal=True,
+)
+
+
+register_model_group(  # Qwen3-Omni Thinking checkpoint (single entry)
+    models={
+        "Qwen3-Omni-30B-A3B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
+        },
+    },
+    template="qwen3_omni",
+    multimodal=True,  # group is flagged multimodal; how the flag is consumed is not visible here
+)
+
+
+register_model_group(  # Qwen2-VL, QVQ and Qwen2.5-VL checkpoints, all registered with template "qwen2_vl"
+    models={
+        "Qwen2-VL-2B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B",
+        },
+        "Qwen2-VL-7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B",
+        },
+        "Qwen2-VL-72B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B",
+        },
+        "Qwen2-VL-2B-Instruct": {  # one of two entries in this group that also list an OPENMIND source
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct",
+            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-VL-2B-Instruct",
+        },
+        "Qwen2-VL-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct",
+            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-VL-7B-Instruct",
+        },
+        "Qwen2-VL-72B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct",
+        },
+        "Qwen2-VL-2B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2-VL-2B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-VL-2B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct-AWQ",
+        },
+        "Qwen2-VL-7B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2-VL-7B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-VL-7B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct-AWQ",
+        },
+        "Qwen2-VL-72B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8",
+        },
+        "Qwen2-VL-72B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4",
+        },
+        "Qwen2-VL-72B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct-AWQ",
+        },
+        "QVQ-72B-Preview": {
+            DownloadSource.DEFAULT: "Qwen/QVQ-72B-Preview",
+            DownloadSource.MODELSCOPE: "Qwen/QVQ-72B-Preview",
+        },
+        "Qwen2.5-VL-3B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-3B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-3B-Instruct",
+        },
+        "Qwen2.5-VL-7B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-7B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-7B-Instruct",
+        },
+        "Qwen2.5-VL-32B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-32B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-32B-Instruct",
+        },
+        "Qwen2.5-VL-72B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-72B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-72B-Instruct",
+        },
+        "Qwen2.5-VL-3B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
+        },
+        "Qwen2.5-VL-7B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
+        },
+        "Qwen2.5-VL-72B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-72B-Instruct-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-72B-Instruct-AWQ",
+        },
+    },
+    template="qwen2_vl",  # the Qwen2.5-VL and QVQ entries above reuse this same template name
+    multimodal=True,
+)
+
+
+register_model_group(  # Qwen3-VL Instruct checkpoints (dense and A3B/A22B variants)
+    models={
+        "Qwen3-VL-2B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-2B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-2B-Instruct",
+        },
+        "Qwen3-VL-4B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-4B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-4B-Instruct",
+        },
+        "Qwen3-VL-8B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-8B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-8B-Instruct",
+        },
+        "Qwen3-VL-32B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-32B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-32B-Instruct",
+        },
+        "Qwen3-VL-30B-A3B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-30B-A3B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-30B-A3B-Instruct",
+        },
+        "Qwen3-VL-235B-A22B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+        },
+    },
+    template="qwen3_vl_nothink",  # the matching Thinking checkpoints are registered with "qwen3_vl"
+    multimodal=True,
+)
+
+
+register_model_group(  # Qwen3-VL Thinking checkpoints (same sizes as the Instruct group)
+    models={
+        "Qwen3-VL-2B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-2B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-2B-Thinking",
+        },
+        "Qwen3-VL-4B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-4B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-4B-Thinking",
+        },
+        "Qwen3-VL-8B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-8B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-8B-Thinking",
+        },
+        "Qwen3-VL-32B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-32B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-32B-Thinking",
+        },
+        "Qwen3-VL-30B-A3B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-30B-A3B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-30B-A3B-Thinking",
+        },
+        "Qwen3-VL-235B-A22B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+        },
+    },
+    template="qwen3_vl",
+    multimodal=True,  # group is flagged multimodal; how the flag is consumed is not visible here
+)
+
+
+register_model_group(  # ByteDance Seed-Coder 8B checkpoints: Base, Instruct and Reasoning
+    models={
+        "Seed-Coder-8B-Base": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Base",
+            DownloadSource.MODELSCOPE: "ByteDance-Seed/Seed-Coder-8B-Base",
+        },
+        "Seed-Coder-8B-Instruct": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Instruct",
+            DownloadSource.MODELSCOPE: "ByteDance-Seed/Seed-Coder-8B-Instruct",
+        },
+        "Seed-Coder-8B-Thinking": {  # registered as "Thinking"; the repo id is "...-Reasoning-bf16"
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
+            DownloadSource.MODELSCOPE: "ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
+        },
+    },
+    template="seed_coder",
+)
+
+
+register_model_group(  # ByteDance Seed-OSS 36B checkpoints: Base, Base-woSyn and Instruct
+    models={
+        "Seed-OSS-36B-Base": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-OSS-36B-Base",
+            DownloadSource.MODELSCOPE: "ByteDance-Seed/Seed-OSS-36B-Base",
+        },
+        "Seed-OSS-36B-Base-woSyn": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-OSS-36B-Base-woSyn",
+            DownloadSource.MODELSCOPE: "ByteDance-Seed/Seed-OSS-36B-Base-woSyn",
+        },
+        "Seed-OSS-36B-Instruct": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-OSS-36B-Instruct",
+            DownloadSource.MODELSCOPE: "ByteDance-Seed/Seed-OSS-36B-Instruct",
+        },
+    },
+    template="seed_oss",  # one template name is shared by base and Instruct entries alike
+)
+
+
+register_model_group(  # first-generation SmolLM checkpoints: three sizes, each as base and Instruct
+    models={
+        "SmolLM-135M": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-135M",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-135M",
+        },
+        "SmolLM-360M": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-360M",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-360M",
+        },
+        "SmolLM-1.7B": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-1.7B",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-1.7B",
+        },
+        "SmolLM-135M-Instruct": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-135M-Instruct",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-135M-Instruct",
+        },
+        "SmolLM-360M-Instruct": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-360M-Instruct",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-360M-Instruct",
+        },
+        "SmolLM-1.7B-Instruct": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-1.7B-Instruct",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-1.7B-Instruct",
+        },
+    },
+    template="smollm",  # SmolLM2 is registered separately with its own "smollm2" template
+)
+
+
+register_model_group(  # SmolLM2 checkpoints: three sizes, each as base and Instruct
+    models={
+        "SmolLM2-135M": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-135M",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-135M",
+        },
+        "SmolLM2-360M": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-360M",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-360M",
+        },
+        "SmolLM2-1.7B": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-1.7B",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-1.7B",
+        },
+        "SmolLM2-135M-Instruct": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-135M-Instruct",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-135M-Instruct",
+        },
+        "SmolLM2-360M-Instruct": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-360M-Instruct",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-360M-Instruct",
+        },
+        "SmolLM2-1.7B-Instruct": {
+            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+        },
+    },
+    template="smollm2",
+)
+
+
+register_model_group(  # Upstage SOLAR 10.7B base and Instruct checkpoints
+    models={
+        "SOLAR-10.7B-v1.0": {  # only a DEFAULT source is listed for the base model
+            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0",
+        },
+        "SOLAR-10.7B-Instruct-v1.0": {
+            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0",
+        },
+    },
+    template="solar",
+)
+
+
+register_model_group(  # BigCode StarCoder2 checkpoints in three sizes
+    models={
+        "StarCoder2-3B": {
+            DownloadSource.DEFAULT: "bigcode/starcoder2-3b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-3b",
+        },
+        "StarCoder2-7B": {
+            DownloadSource.DEFAULT: "bigcode/starcoder2-7b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-7b",
+        },
+        "StarCoder2-15B": {
+            DownloadSource.DEFAULT: "bigcode/starcoder2-15b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-15b",
+        },
+    }  # no template= is passed here, so register_model_group's own default applies (defined elsewhere)
+)
+
+
+register_model_group(
+    models={
+        "TeleChat2-3B-Chat": {
+            DownloadSource.DEFAULT: "Tele-AI/TeleChat2-3B",
+            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-3B",
+        },
+        "TeleChat2-7B-Chat": {
+            DownloadSource.DEFAULT: "Tele-AI/TeleChat2-7B",
+            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-7B",
+        },
+        "TeleChat2-115B-Chat": {
+            DownloadSource.DEFAULT: "Tele-AI/TeleChat2-115B",
+            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-115B",
+        },
+        "TeleChat2.5-35B-Chat": {
+            DownloadSource.DEFAULT: "Tele-AI/TeleChat2.5-35B",
+            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-35B-Nov",
+        },
+    },
+    template="telechat2",
+)
+
+
+register_model_group(  # WeiboAI VibeThinker 1.5B (single entry)
+    models={
+        "VibeThinker-1.5B": {
+            DownloadSource.DEFAULT: "WeiboAI/VibeThinker-1.5B",
+            DownloadSource.MODELSCOPE: "WeiboAI/VibeThinker-1.5B",
+        },
+    },
+    template="qwen3",  # reuses the "qwen3" template name rather than a model-specific one
+)
+
+
+register_model_group(  # Vicuna v1.5 chat checkpoints (7B and 13B)
+    models={
+        "Vicuna-v1.5-7B-Chat": {  # the MODELSCOPE copy lives under the "Xorbits" org, not "lmsys"
+            DownloadSource.DEFAULT: "lmsys/vicuna-7b-v1.5",
+            DownloadSource.MODELSCOPE: "Xorbits/vicuna-7b-v1.5",
+        },
+        "Vicuna-v1.5-13B-Chat": {
+            DownloadSource.DEFAULT: "lmsys/vicuna-13b-v1.5",
+            DownloadSource.MODELSCOPE: "Xorbits/vicuna-13b-v1.5",
+        },
+    },
+    template="vicuna",
+)
+
+
+register_model_group(  # Video-LLaVA 7B (single entry)
+    models={
+        "Video-LLaVA-7B-Chat": {  # only a DEFAULT source is listed
+            DownloadSource.DEFAULT: "LanguageBind/Video-LLaVA-7B-hf",
+        },
+    },
+    template="video_llava",
+    multimodal=True,
+)
+
+
+register_model_group(  # Duxiaoman XuanYuan / XuanYuan2 checkpoints: base, Chat, and 8bit/4bit Chat variants
+    models={
+        "XuanYuan-6B": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B",
+        },
+        "XuanYuan-70B": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B",
+        },
+        "XuanYuan2-70B": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B",
+        },
+        "XuanYuan-6B-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat",
+        },
+        "XuanYuan-70B-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat",
+        },
+        "XuanYuan2-70B-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat",
+        },
+        "XuanYuan-6B-Chat-8bit": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit",
+        },
+        "XuanYuan-6B-Chat-4bit": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit",
+        },
+        "XuanYuan-70B-Chat-8bit": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit",
+        },
+        "XuanYuan-70B-Chat-4bit": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit",
+        },
+        "XuanYuan2-70B-Chat-8bit": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit",
+        },
+        "XuanYuan2-70B-Chat-4bit": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit",
+        },
+    },
+    template="xuanyuan",  # one template name is shared by base, Chat and quantized entries alike
+)
+
+
+register_model_group(  # Yi: DEFAULT repo ids use the "01-ai" owner, MODELSCOPE ids use "01ai"
+    models={
+        "Yi-6B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-6B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-6B",
+        },
+        "Yi-9B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-9B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-9B",
+        },
+        "Yi-34B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-34B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-34B",
+        },
+        "Yi-6B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat",
+        },
+        "Yi-34B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat",
+        },
+        "Yi-6B-Chat-8bits": {
+            DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat-8bits",
+            DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat-8bits",
+        },
+        "Yi-6B-Chat-4bits": {
+            DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat-4bits",
+            DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat-4bits",
+        },
+        "Yi-34B-Chat-8bits": {
+            DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-8bits",
+            DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-8bits",
+        },
+        "Yi-34B-Chat-4bits": {
+            DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-4bits",
+            DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-4bits",
+        },
+        "Yi-1.5-6B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B",
+        },
+        "Yi-1.5-9B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B",
+        },
+        "Yi-1.5-34B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B",
+        },
+        "Yi-1.5-6B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B-Chat",
+            DownloadSource.OPENMIND: "LlamaFactory/Yi-1.5-6B-Chat",  # the only OPENMIND source in this group
+        },
+        "Yi-1.5-9B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B-Chat",
+        },
+        "Yi-1.5-34B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B-Chat",
+        },
+        "Yi-Coder-1.5B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-Coder-1.5B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-1.5B",
+        },
+        "Yi-Coder-9B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-Coder-9B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-9B",
+        },
+        "Yi-Coder-1.5B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-Coder-1.5B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-1.5B-Chat",
+        },
+        "Yi-Coder-9B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-Coder-9B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-9B-Chat",
+        },
+    },
+    template="yi",
+)
+
+
+register_model_group(  # Yi-VL: multimodal group; both checkpoints list a DEFAULT source only
+    models={
+        "Yi-VL-6B-Chat": {
+            DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-6B-hf",
+        },
+        "Yi-VL-34B-Chat": {
+            DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-34B-hf",
+        },
+    },
+    template="yi_vl",
+    multimodal=True,
+)
+
+
+register_model_group(  # Youtu-LLM: the base and instruct entries share the "youtu" template
+    models={
+        "Youtu-LLM-2B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Youtu-LLM-2B",  # repo id carries no "-Instruct" suffix
+            DownloadSource.MODELSCOPE: "Tencent-YouTu-Research/Youtu-LLM-2B",
+        },
+        "Youtu-LLM-2B-Base": {
+            DownloadSource.DEFAULT: "tencent/Youtu-LLM-2B-Base",
+            DownloadSource.MODELSCOPE: "Tencent-YouTu-Research/Youtu-LLM-2B-Base",
+        },
+    },
+    template="youtu",
+)
+
+
+register_model_group(  # Youtu-VL: single multimodal checkpoint with DEFAULT and MODELSCOPE sources
+    models={
+        "Youtu-VL-4B-Instruct": {
+            DownloadSource.DEFAULT: "tencent/Youtu-VL-4B-Instruct",
+            DownloadSource.MODELSCOPE: "Tencent-YouTu-Research/Youtu-VL-4B-Instruct",
+        },
+    },
+    template="youtu_vl",
+    multimodal=True,
+)
+
+
+register_model_group(  # Yuan2: DEFAULT repo ids are spelled "Yuan2-", MODELSCOPE ids "Yuan2.0-"
+    models={
+        "Yuan2-2B-Chat": {
+            DownloadSource.DEFAULT: "IEITYuan/Yuan2-2B-hf",
+            DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-2B-hf",
+        },
+        "Yuan2-51B-Chat": {
+            DownloadSource.DEFAULT: "IEITYuan/Yuan2-51B-hf",
+            DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-51B-hf",
+        },
+        "Yuan2-102B-Chat": {
+            DownloadSource.DEFAULT: "IEITYuan/Yuan2-102B-hf",
+            DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-102B-hf",
+        },
+    },
+    template="yuan",
+)
+
+
+register_model_group(  # Zephyr: the two 7B entries use different MODELSCOPE owners
+    models={
+        "Zephyr-7B-Alpha-Chat": {
+            DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-alpha",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/zephyr-7b-alpha",
+        },
+        "Zephyr-7B-Beta-Chat": {
+            DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-beta",
+            DownloadSource.MODELSCOPE: "modelscope/zephyr-7b-beta",
+        },
+        "Zephyr-141B-ORPO-Chat": {
+            DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",  # no MODELSCOPE source listed
+        },
+    },
+    template="zephyr",
+)
diff --git a/LlamaFactory/src/llamafactory/extras/env.py b/LlamaFactory/src/llamafactory/extras/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e3d68c38c0f964da88ecfd250ad115ad4d2238e
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/extras/env.py
@@ -0,0 +1,102 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/commands/env.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from collections import OrderedDict
+
+
+VERSION = "0.9.5.dev0"  # reported by print_env() as the `llamafactory` version
+
+
+def print_env() -> None:
+    r"""Print a bulleted summary of the runtime environment to stdout.
+
+    Always reported:
+        the llamafactory, platform, Python, PyTorch, Transformers, Datasets, Accelerate
+        and PEFT versions, plus whether a `data` path exists in the working directory.
+
+    Reported only when they can be queried:
+        - CUDA: device name, device count and total memory.
+        - NPU: device name and CANN version.
+        - TRL, DeepSpeed, bitsandbytes and vLLM versions.
+        - The commit hash printed by `git rev-parse HEAD`.
+
+    Every entry is printed on its own line as `- <key>: <value>`.
+    """
+    import importlib
+    import os
+    import platform
+
+    import accelerate
+    import datasets
+    import peft
+    import torch
+    import transformers
+    from transformers.utils import is_torch_cuda_available, is_torch_npu_available
+
+    # Insertion order below is the order of the printed report.
+    info = OrderedDict()
+    info["`llamafactory` version"] = VERSION
+    info["Platform"] = platform.platform()
+    info["Python version"] = platform.python_version()
+    info["PyTorch version"] = torch.__version__
+    info["Transformers version"] = transformers.__version__
+    info["Datasets version"] = datasets.__version__
+    info["Accelerate version"] = accelerate.__version__
+    info["PEFT version"] = peft.__version__
+
+    if is_torch_cuda_available():
+        info["PyTorch version"] += " (GPU)"
+        info["GPU type"] = torch.cuda.get_device_name()
+        info["GPU number"] = torch.cuda.device_count()
+        # The second element of mem_get_info() is divided by 1024**3 for display.
+        total_memory = torch.cuda.mem_get_info()[1] / (1024**3)
+        info["GPU memory"] = f"{total_memory:.2f}GB"
+
+    if is_torch_npu_available():
+        info["PyTorch version"] += " (NPU)"
+        info["NPU type"] = torch.npu.get_device_name()
+        info["CANN version"] = torch.version.cann
+
+    # Optional packages: one that cannot be imported or queried is left out of the report.
+    optional_packages = (
+        ("TRL version", "trl"),
+        ("DeepSpeed version", "deepspeed"),
+        ("Bitsandbytes version", "bitsandbytes"),
+        ("vLLM version", "vllm"),
+    )
+    for label, module_name in optional_packages:
+        try:
+            info[label] = importlib.import_module(module_name).__version__
+        except Exception:
+            pass
+
+    # The git command runs in the current working directory; on any failure the entry is skipped.
+    try:
+        import subprocess
+
+        git_result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True)
+        info["Git commit"] = git_result.stdout.strip()
+    except Exception:
+        pass
+
+    # Only the existence of `data`, relative to the working directory, is checked.
+    data_dir_status = "detected" if os.path.exists("data") else "not detected"
+    info["Default data directory"] = data_dir_status
+
+    report_lines = [f"- {key}: {value}" for key, value in info.items()]
+    print("\n" + "\n".join(report_lines) + "\n")
diff --git a/LlamaFactory/src/llamafactory/extras/logging.py b/LlamaFactory/src/llamafactory/extras/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..35ff65bb6b5d93e03383ed19fb265815318e4c52
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/extras/logging.py
@@ -0,0 +1,160 @@
+# Copyright 2025 Optuna, HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/logging.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from functools import lru_cache
+from typing import Optional
+
+from .constants import RUNNING_LOG
+
+
+_thread_lock = threading.RLock()  # held while _configure_library_root_logger() installs the handler
+_default_handler: Optional["logging.Handler"] = None  # stdout handler; stays None until first configuration
+_default_log_level: "logging._Level" = logging.INFO  # used when LLAMAFACTORY_VERBOSITY is unset or empty
+
+
+class LoggerHandler(logging.Handler):
+    r"""Append formatted log records to a running log file for LLaMA Board."""
+
+    def __init__(self, output_dir: str) -> None:
+        super().__init__()
+        self.setLevel(logging.INFO)
+        self._formatter = logging.Formatter(
+            fmt="[%(levelname)s|%(asctime)s] %(filename)s:%(lineno)s >> %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+        # A single worker thread performs the file writes, one at a time.
+        self.thread_pool = ThreadPoolExecutor(max_workers=1)
+        os.makedirs(output_dir, exist_ok=True)
+        self.running_log = os.path.join(output_dir, RUNNING_LOG)
+        try:
+            os.remove(self.running_log)  # remove any existing log file
+        except OSError:
+            pass
+
+    def _write_log(self, log_entry: str) -> None:
+        with open(self.running_log, mode="a", encoding="utf-8") as log_file:
+            log_file.write(f"{log_entry}\n")
+
+    def emit(self, record) -> None:
+        # Records from the "httpx" logger are dropped; the rest are formatted here
+        # and handed to the worker thread to be appended to the file.
+        if record.name != "httpx":
+            self.thread_pool.submit(self._write_log, self._formatter.format(record))
+
+    def close(self) -> None:
+        self.thread_pool.shutdown(wait=True)  # let queued writes finish first
+        super().close()
+
+
+class _Logger(logging.Logger):
+    r"""Logger subclass declaring the rank0 helpers; as written here they forward unconditionally."""
+
+    def info_rank0(self, *args, **kwargs) -> None:
+        self.info(*args, **kwargs)  # plain forward: no LOCAL_RANK check in this method
+
+    def warning_rank0(self, *args, **kwargs) -> None:
+        self.warning(*args, **kwargs)  # plain forward: no LOCAL_RANK check in this method
+
+    def warning_rank0_once(self, *args, **kwargs) -> None:
+        self.warning(*args, **kwargs)  # plain forward: no caching or rank check in this method
+
+
+def _get_default_logging_level() -> "logging._Level":
+    r"""Return the level named by `LLAMAFACTORY_VERBOSITY`, or the module default if it is unset or empty."""
+    verbosity = os.getenv("LLAMAFACTORY_VERBOSITY", None)
+    if not verbosity:
+        return _default_log_level
+
+    level_name = verbosity.upper()  # level names are matched case-insensitively
+    if level_name not in logging._nameToLevel:
+        raise ValueError(f"Unknown logging level: {verbosity}.")
+    return logging._nameToLevel[level_name]
+
+
+def _get_library_name() -> str:
+    return __name__.partition(".")[0]  # first dotted component of this module's name
+
+
+def _get_library_root_logger() -> "_Logger":
+    return logging.getLogger(__name__.split(".")[0])  # logger named after the first component of __name__
+
+
+def _configure_library_root_logger() -> None:
+    r"""Install a formatted stdout handler on the library root logger; later calls do nothing."""
+    global _default_handler
+
+    with _thread_lock:
+        if not _default_handler:  # skip when a handler has already been installed
+            stdout_handler = logging.StreamHandler(sys.stdout)
+            stdout_handler.setFormatter(
+                logging.Formatter(
+                    fmt="[%(levelname)s|%(asctime)s] %(name)s:%(lineno)s >> %(message)s",
+                    datefmt="%Y-%m-%d %H:%M:%S",
+                )
+            )
+            _default_handler = stdout_handler
+            root_logger = _get_library_root_logger()
+            root_logger.addHandler(stdout_handler)
+            root_logger.setLevel(_get_default_logging_level())
+            root_logger.propagate = False  # records are not passed on to ancestor loggers
+
+
+def get_logger(name: str | None = None) -> "_Logger":
+    r"""Return the logger named `name` (default: the library root). Not meant to be accessed externally."""
+    _configure_library_root_logger()
+    if name is None:
+        return _get_library_root_logger()
+
+    return logging.getLogger(name)
+
+
+def add_handler(handler: "logging.Handler") -> None:
+    r"""Add a handler to the library root logger, configuring that logger first if necessary."""
+    _configure_library_root_logger()
+    _get_library_root_logger().addHandler(handler)
+
+
+def remove_handler(handler: logging.Handler) -> None:
+    r"""Remove a handler from the library root logger, configuring that logger first if necessary."""
+    _configure_library_root_logger()
+    _get_library_root_logger().removeHandler(handler)
+
+
+def info_rank0(self: "logging.Logger", *args, **kwargs) -> None:
+    if not int(os.environ.get("LOCAL_RANK", "0")):  # log only where LOCAL_RANK is 0 or unset
+        self.info(*args, **kwargs)
+
+
+def warning_rank0(self: "logging.Logger", *args, **kwargs) -> None:
+    if not int(os.environ.get("LOCAL_RANK", "0")):  # warn only where LOCAL_RANK is 0 or unset
+        self.warning(*args, **kwargs)
+
+
+@lru_cache(maxsize=None)  # repeated calls with the same logger and arguments are served from the cache
+def warning_rank0_once(self: "logging.Logger", *args, **kwargs) -> None:
+    if not int(os.environ.get("LOCAL_RANK", "0")):  # warn only where LOCAL_RANK is 0 or unset
+        self.warning(*args, **kwargs)
+
+
+logging.Logger.info_rank0 = info_rank0  # attach the LOCAL_RANK-aware helpers to logging.Logger itself
+logging.Logger.warning_rank0 = warning_rank0
+logging.Logger.warning_rank0_once = warning_rank0_once  # this one is the lru_cache-wrapped function
diff --git a/LlamaFactory/src/llamafactory/extras/misc.py b/LlamaFactory/src/llamafactory/extras/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c077dd57b6aef78e7b86d2bf1cfea20e2199b31
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/extras/misc.py
@@ -0,0 +1,365 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's PEFT library.
+# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/peft_model.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+import socket
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+
+import torch
+import torch.distributed as dist
+import transformers.dynamic_module_utils
+from huggingface_hub.utils import WeakFileLock
+from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList
+from transformers.dynamic_module_utils import get_relative_imports
+from transformers.utils import (
+    is_torch_bf16_gpu_available,
+    is_torch_cuda_available,
+    is_torch_mps_available,
+    is_torch_npu_available,
+    is_torch_xpu_available,
+)
+from transformers.utils.versions import require_version
+
+from . import logging
+
+
+_is_fp16_available = is_torch_npu_available() or is_torch_cuda_available()  # evaluated once at import time
+try:
+    _is_bf16_available = is_torch_bf16_gpu_available() or (is_torch_npu_available() and torch.npu.is_bf16_supported())
+except Exception:  # a failing capability probe is treated as "bf16 not available"
+    _is_bf16_available = False
+
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+    from ..hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class AverageMeter:
+    r"""Track the latest value together with a running weighted sum, count and mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        r"""Set every statistic back to zero."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        r"""Record `val` as `n` observations and refresh the running mean."""
+        weighted = val * n
+        self.val = val
+        self.sum += weighted
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def check_version(requirement: str, mandatory: bool = False) -> None:
+    r"""Check that the installed package satisfies `requirement` (a pip-style specifier).
+
+    Non-mandatory checks are skipped, with a one-time warning, when `DISABLE_VERSION_CHECK` is enabled.
+    """
+    if is_env_enabled("DISABLE_VERSION_CHECK") and not mandatory:
+        logger.warning_rank0_once("Version checking has been disabled, may lead to unexpected behaviors.")
+        return
+
+    pip_command = f"pip install {requirement}"
+    # Match the package name "gptqmodel" (and autoawq) so their install hint gets the flag.
+    if "gptqmodel" in requirement or "autoawq" in requirement:
+        pip_command += " --no-build-isolation"
+
+    skip_note = "" if mandatory else " or set `DISABLE_VERSION_CHECK=1` to skip this check"
+    # The hint is passed along for require_version to report with an unmet requirement.
+    require_version(requirement, f"To fix: run `{pip_command}`{skip_note}.")
+
+
+def check_dependencies() -> None:
+    r"""Check the core dependencies against their supported version ranges (skippable via DISABLE_VERSION_CHECK)."""
+    check_version("transformers>=4.51.0,<=5.0.0")
+    check_version("datasets>=2.16.0,<=4.0.0")
+    check_version("accelerate>=1.3.0,<=1.11.0")
+    check_version("peft>=0.18.0,<=0.18.1")
+    check_version("trl>=0.18.0,<=0.24.0")
+
+
+def calculate_tps(dataset: list[dict[str, Any]], metrics: dict[str, float], stage: Literal["sft", "rm"]) -> float:
+    r"""Calculate effective tokens per second, divided by the world size when torch.distributed is initialized."""
+    if stage == "sft":
+        effective_token_num = sum(len(data["input_ids"]) for data in dataset)
+    elif stage == "rm":
+        effective_token_num = sum(len(d["chosen_input_ids"]) + len(d["rejected_input_ids"]) for d in dataset)
+    else:
+        effective_token_num = 0  # any other stage counts no tokens
+
+    tokens_per_second = effective_token_num * metrics["epoch"] / metrics["train_runtime"]
+    return tokens_per_second / dist.get_world_size() if dist.is_initialized() else tokens_per_second
+
+
+def count_parameters(model: "torch.nn.Module") -> tuple[int, int]:
+    r"""Return `(trainable, total)` parameter counts for the model."""
+    total_count, trainable_count = 0, 0
+    for param in model.parameters():
+        size = param.numel()
+        if size == 0 and hasattr(param, "ds_numel"):
+            size = param.ds_numel  # if using DS Zero 3 and the weights are initialized empty
+
+        if param.__class__.__name__ == "Params4bit":
+            # Due to the design of bitsandbytes 4bit layers, scale the count by 2 * bytes per storage element.
+            storage = getattr(param, "quant_storage", None)
+            if hasattr(storage, "itemsize"):
+                bytes_per_element = storage.itemsize
+            elif hasattr(param, "element_size"):  # for older pytorch version
+                bytes_per_element = param.element_size()
+            else:
+                bytes_per_element = 1
+
+            size = size * 2 * bytes_per_element
+
+        total_count += size
+        if param.requires_grad:
+            trainable_count += size
+
+    return trainable_count, total_count
+
+
+def get_current_device() -> "torch.device":
+    r"""Return the device of the first available backend, indexed by `LOCAL_RANK` (default 0), else CPU."""
+    local_rank = os.getenv("LOCAL_RANK", "0")
+
+    if is_torch_xpu_available():
+        return torch.device(f"xpu:{local_rank}")
+    if is_torch_npu_available():
+        return torch.device(f"npu:{local_rank}")
+    if is_torch_mps_available():
+        return torch.device(f"mps:{local_rank}")
+    if is_torch_cuda_available():
+        return torch.device(f"cuda:{local_rank}")
+
+    return torch.device("cpu")
+
+
+def get_device_name() -> str:
+    r"""Return the name of the first available backend: "xpu", "npu", "mps", "gpu" or "cpu"."""
+    # Backends are probed in a fixed order and the first available one wins;
+    # "cpu" is the answer when none of them is available.
+    if is_torch_xpu_available():
+        return "xpu"
+    if is_torch_npu_available():
+        return "npu"
+    if is_torch_mps_available():
+        return "mps"
+    if is_torch_cuda_available():
+        return "gpu"  # CUDA is reported as "gpu"
+
+    return "cpu"
+
+
+def get_torch_device():
+    r"""Return the torch namespace (e.g. `torch.cuda`) of the available device, defaulting to `torch.cuda`."""
+    reported = get_device_name()
+    device_name = "cuda" if reported == "gpu" else reported  # "gpu" is looked up as torch.cuda
+    try:
+        return getattr(torch, device_name)
+    except AttributeError:
+        logger.warning_rank0(f"Device namespace '{device_name}' not found in torch, try to load torch.cuda.")
+        return torch.cuda
+
+
+def get_device_count() -> int:
+    r"""Return the device count of the first available backend, or 0 when none is available."""
+    if is_torch_xpu_available():
+        return torch.xpu.device_count()
+    if is_torch_npu_available():
+        return torch.npu.device_count()
+    if is_torch_mps_available():
+        return torch.mps.device_count()
+    if is_torch_cuda_available():
+        return torch.cuda.device_count()
+
+    return 0
+
+
+def get_logits_processor() -> "LogitsProcessorList":
+    r"""Build a processor list holding a single `InfNanRemoveLogitsProcessor`."""
+    processors = LogitsProcessorList()
+    processors.extend([InfNanRemoveLogitsProcessor()])
+    return processors
+
+
+def get_current_memory() -> tuple[int, int]:
+    r"""Get the available and total memory for the current device (in Bytes); `(0, -1)` without an accelerator."""
+    if is_torch_xpu_available():
+        return torch.xpu.mem_get_info()
+    if is_torch_npu_available():
+        return torch.npu.mem_get_info()
+    if is_torch_mps_available():  # available = recommended maximum - currently allocated, floored at 0
+        total = torch.mps.recommended_max_memory()
+        return max(total - torch.mps.current_allocated_memory(), 0), total
+    if is_torch_cuda_available():
+        return torch.cuda.mem_get_info()
+    return 0, -1
+
+
+def get_peak_memory() -> tuple[int, int]:
+    r"""Get the peak memory usage (allocated, reserved) for the current device (in Bytes)."""
+    if is_torch_xpu_available():
+        return torch.xpu.max_memory_allocated(), torch.xpu.max_memory_reserved()
+    if is_torch_npu_available():
+        return torch.npu.max_memory_allocated(), torch.npu.max_memory_reserved()
+    if is_torch_mps_available():
+        return torch.mps.current_allocated_memory(), -1  # current allocation; reserved is reported as -1
+    if is_torch_cuda_available():
+        return torch.cuda.max_memory_allocated(), torch.cuda.max_memory_reserved()
+
+    return 0, -1
+
+
+def has_tokenized_data(path: "os.PathLike") -> bool:
+    r"""Return True when `path` is a directory that contains at least one entry."""
+    return os.path.isdir(path) and bool(os.listdir(path))
+
+
+def infer_optim_dtype(model_dtype: Optional["torch.dtype"]) -> "torch.dtype":
+    r"""Pick bfloat16, float16 or float32 from `model_dtype` and the device capability flags."""
+    wants_bf16 = model_dtype is None or model_dtype == torch.bfloat16
+    if _is_bf16_available and wants_bf16:
+        return torch.bfloat16
+
+    # Otherwise use float16 when it is available and float32 as the last resort.
+    return torch.float16 if _is_fp16_available else torch.float32
+
+
+def is_accelerator_available() -> bool:
+    r"""Return True when an XPU, NPU, MPS or CUDA backend is available."""
+    probes = (is_torch_xpu_available, is_torch_npu_available, is_torch_mps_available, is_torch_cuda_available)
+    # any() stops at the first probe that reports availability.
+    return any(probe() for probe in probes)
+
+
+def is_env_enabled(env_var: str, default: str = "0") -> bool:
+    r"""Return True when the variable (or `default` if unset) is "true", "y" or "1", ignoring case."""
+    return os.getenv(env_var, default).lower() in ("true", "y", "1")
+
+
+def numpify(inputs: Union["NDArray", "torch.Tensor"]) -> "NDArray":
+    r"""Convert a torch tensor to a numpy array; any other input is returned unchanged."""
+    if not isinstance(inputs, torch.Tensor):
+        return inputs
+
+    tensor = inputs.cpu()
+    if tensor.dtype == torch.bfloat16:  # bfloat16 is upcast to float32 before the numpy conversion
+        tensor = tensor.to(torch.float32)
+
+    return tensor.numpy()
+
+
+def skip_check_imports() -> None:
+    r"""Avoid flash attention import error in custom model files."""
+    if not is_env_enabled("FORCE_CHECK_IMPORTS"):  # enabling FORCE_CHECK_IMPORTS leaves check_imports untouched
+        transformers.dynamic_module_utils.check_imports = get_relative_imports  # replaces the module attribute
+
+
+def torch_gc() -> None:
+    r"""Run the Python garbage collector, then empty the cache of the first available backend."""
+    gc.collect()
+    probes = (
+        ("xpu", is_torch_xpu_available),
+        ("npu", is_torch_npu_available),
+        ("mps", is_torch_mps_available),
+        ("cuda", is_torch_cuda_available),
+    )
+    backend = next((name for name, probe in probes if probe()), None)
+    if backend is not None:
+        getattr(torch, backend).empty_cache()
+
+def try_download_model_from_other_hub(model_args: "ModelArguments") -> str:
+    r"""Download the model from ModelScope or OpenMind when enabled, returning the path to load from."""
+    # Nothing to fetch: no alternative hub is enabled, or the name is an existing local path.
+    if not (use_modelscope() or use_openmind()) or os.path.exists(model_args.model_name_or_path):
+        return model_args.model_name_or_path
+
+    if use_modelscope():  # tested first, so ModelScope is used when both hubs are enabled
+        check_version("modelscope>=1.14.0", mandatory=True)
+        from modelscope import snapshot_download  # type: ignore
+        from modelscope.hub.api import HubApi  # type: ignore
+
+        if model_args.ms_hub_token:
+            HubApi().login(model_args.ms_hub_token)
+
+        # The "main" revision is requested from ModelScope as "master".
+        ms_revision = model_args.model_revision if model_args.model_revision != "main" else "master"
+        lock_path = os.path.abspath(os.path.expanduser("~/.cache/llamafactory/modelscope.lock"))
+        with WeakFileLock(lock_path):
+            return snapshot_download(
+                model_args.model_name_or_path,
+                revision=ms_revision,
+                cache_dir=model_args.cache_dir,
+            )
+
+    if use_openmind():
+        check_version("openmind>=0.8.0", mandatory=True)
+        from openmind.utils.hub import snapshot_download  # type: ignore
+
+        lock_path = os.path.abspath(os.path.expanduser("~/.cache/llamafactory/openmind.lock"))
+        with WeakFileLock(lock_path):
+            return snapshot_download(
+                model_args.model_name_or_path,
+                revision=model_args.model_revision,
+                cache_dir=model_args.cache_dir,
+            )
+
+
+def use_modelscope() -> bool:
+    return is_env_enabled("USE_MODELSCOPE_HUB")  # env flag; "true", "y" or "1" (any case) enables it
+
+
+def use_openmind() -> bool:
+    return is_env_enabled("USE_OPENMIND_HUB")  # env flag; "true", "y" or "1" (any case) enables it
+
+
+def use_ray() -> bool:
+    return is_env_enabled("USE_RAY")  # env flag; "true", "y" or "1" (any case) enables it
+
+
+def use_kt() -> bool:
+    return is_env_enabled("USE_KT")  # env flag; "true", "y" or "1" (any case) enables it
+
+
+def find_available_port() -> int:
+    r"""Find an available port on the local machine (the probing socket is closed before returning)."""
+    # The `with` block closes the socket even when bind() or getsockname() raises.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.bind(("", 0))  # port 0 lets the OS choose a free port
+        port = sock.getsockname()[1]
+    return port
+
+
+def fix_proxy(ipv6_enabled: bool = False) -> None:
+    r"""Fix proxy settings for gradio ui: set `no_proxy`, and drop the proxy variables when IPv6 is enabled."""
+    os.environ["no_proxy"] = "localhost,127.0.0.1,0.0.0.0"
+    if not ipv6_enabled:
+        return
+
+    for proxy_var in ("http_proxy", "https_proxy", "all_proxy"):
+        # Remove the lower-case spelling first, then the upper-case one.
+        os.environ.pop(proxy_var, None)
+        os.environ.pop(proxy_var.upper(), None)
diff --git a/LlamaFactory/src/llamafactory/extras/packages.py b/LlamaFactory/src/llamafactory/extras/packages.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6328a7b028102486d47be5d57f52690dc24c14a
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/extras/packages.py
@@ -0,0 +1,124 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/import_utils.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.metadata
+import importlib.util
+from functools import lru_cache
+from typing import TYPE_CHECKING
+
+from packaging import version
+
+
+if TYPE_CHECKING:
+    from packaging.version import Version
+
+
+def _is_package_available(name: str) -> bool:
+    return importlib.util.find_spec(name) is not None  # a None spec means the module was not found
+
+
+def _get_package_version(name: str) -> "Version":
+    try:
+        return version.parse(importlib.metadata.version(name))
+    except Exception:  # any failure (e.g. no metadata for `name`) falls back to version 0.0.0
+        return version.parse("0.0.0")
+
+
+def is_pyav_available() -> bool:
+    return _is_package_available("av")  # True when the `av` module can be located
+
+
+def is_librosa_available() -> bool:
+    return _is_package_available("librosa")  # True when the `librosa` module can be located
+
+
+def is_fastapi_available() -> bool:
+    return _is_package_available("fastapi")  # True when the `fastapi` module can be located
+
+
+def is_galore_available() -> bool:
+    return _is_package_available("galore_torch")  # True when the `galore_torch` module can be located
+
+
+def is_apollo_available() -> bool:
+    return _is_package_available("apollo_torch")  # True when the `apollo_torch` module can be located
+
+
+def is_jieba_available() -> bool:
+    return _is_package_available("jieba")  # True when the `jieba` module can be located
+
+
+def is_gradio_available() -> bool:
+    return _is_package_available("gradio")  # True when the `gradio` module can be located
+
+
+def is_matplotlib_available() -> bool:
+    return _is_package_available("matplotlib")  # True when the `matplotlib` module can be located
+
+
+def is_mcore_adapter_available() -> bool:
+    return _is_package_available("mcore_adapter")  # True when the `mcore_adapter` module can be located
+
+
+def is_pillow_available() -> bool:
+    return _is_package_available("PIL")  # True when the `PIL` module can be located
+
+
+def is_ray_available() -> bool:
+    return _is_package_available("ray")  # True when the `ray` module can be located
+
+
+def is_kt_available() -> bool:
+    return _is_package_available("ktransformers")  # True when the `ktransformers` module can be located
+
+
+def is_requests_available() -> bool:
+    return _is_package_available("requests")  # True when the `requests` module can be located
+
+
+def is_rouge_available() -> bool:
+    return _is_package_available("rouge_chinese")  # True when the `rouge_chinese` module can be located
+
+
+def is_safetensors_available() -> bool:
+    return _is_package_available("safetensors")  # True when the `safetensors` module can be located
+
+
+def is_sglang_available() -> bool:
+    return _is_package_available("sglang")  # True when the `sglang` module can be located
+
+
+def is_starlette_available() -> bool:
+    return _is_package_available("sse_starlette")  # checks the `sse_starlette` module specifically
+
+
+@lru_cache
+def is_transformers_version_greater_than(content: str) -> bool:
+    return _get_package_version("transformers") >= version.parse(content)  # inclusive: an equal version is True
+
+
+@lru_cache
+def is_torch_version_greater_than(content: str) -> bool:
+    return _get_package_version("torch") >= version.parse(content)  # inclusive: an equal version is True
+
+
+def is_uvicorn_available() -> bool:
+    return _is_package_available("uvicorn")  # True when the `uvicorn` module can be located
+
+
+def is_vllm_available() -> bool:
+    return _is_package_available("vllm")  # True when the `vllm` module can be located
diff --git a/LlamaFactory/src/llamafactory/extras/ploting.py b/LlamaFactory/src/llamafactory/extras/ploting.py
new file mode 100644
index 0000000000000000000000000000000000000000..be89bcc5cb30429b60e13e71eba465f83de90e74
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/extras/ploting.py
@@ -0,0 +1,95 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import math
+import os
+from typing import Any
+
+from transformers.trainer import TRAINER_STATE_NAME
+
+from . import logging
+from .packages import is_matplotlib_available
+
+
+if is_matplotlib_available():
+    import matplotlib.figure
+    import matplotlib.pyplot as plt
+
+
+logger = logging.get_logger(__name__)
+
+
+def smooth(scalars: list[float]) -> list[float]:
+    r"""Exponential moving average of ``scalars`` (TensorBoard-style smoothing)."""
+    if not len(scalars):
+        return []
+    # Sigmoid of the series length, rescaled to lie between 0 and 0.9: longer series are smoothed harder.
+    decay = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5)
+    running = scalars[0]  # seed with the first point so the curve starts on the data
+    result = []
+    for value in scalars:
+        running = running * decay + (1 - decay) * value
+        result.append(running)
+    # One smoothed value per input value, in input order.
+    return result
+
+
+def gen_loss_plot(trainer_log: list[dict[str, Any]]) -> "matplotlib.figure.Figure":
+    r"""Plot loss curves in LlamaBoard: the raw loss (faded) and its smoothed curve against the step."""
+    plt.close("all")  # discard figures left over from earlier calls before building a new one
+    plt.switch_backend("agg")
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    steps, losses = [], []
+    for log in trainer_log:
+        if log.get("loss") is not None:  # explicit None test: a logged loss of exactly 0.0 must still be plotted
+            steps.append(log["current_steps"])
+            losses.append(log["loss"])
+
+    ax.plot(steps, losses, color="#1f77b4", alpha=0.4, label="original")
+    ax.plot(steps, smooth(losses), color="#1f77b4", label="smoothed")
+    ax.legend()
+    ax.set_xlabel("step")
+    ax.set_ylabel("loss")
+    return fig
+
+
+def plot_loss(save_dictionary: str, keys: list[str] = ["loss"]) -> None:
+    r"""Plot every metric in ``keys`` from the saved trainer state and write one PNG per metric."""
+    plt.switch_backend("agg")
+    with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), encoding="utf-8") as f:
+        data = json.load(f)
+
+    for key in keys:  # `keys` is only read, never mutated, so the shared list default is harmless
+        steps, metrics = [], []
+        for entry in data["log_history"]:
+            if key in entry:
+                steps.append(entry["step"])
+                metrics.append(entry[key])
+        if len(metrics) == 0:
+            logger.warning_rank0(f"No metric {key} to plot.")
+            continue
+
+        fig = plt.figure()
+        plt.plot(steps, metrics, color="#1f77b4", alpha=0.4, label="original")
+        plt.plot(steps, smooth(metrics), color="#1f77b4", label="smoothed")
+        plt.title(f"training {key} of {save_dictionary}")
+        plt.xlabel("step")
+        plt.ylabel(key)
+        plt.legend()
+        figure_path = os.path.join(save_dictionary, "training_{}.png".format(key.replace("/", "_")))
+        plt.savefig(figure_path, format="png", dpi=100)
+        plt.close(fig)  # pyplot keeps every figure alive until it is closed; release it once written
+        print("Figure saved at:", figure_path)
diff --git a/LlamaFactory/src/llamafactory/hparams/__init__.py b/LlamaFactory/src/llamafactory/hparams/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bcc4295ce20c431f8db209a40cfc585ae90139f
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .data_args import DataArguments
+from .evaluation_args import EvaluationArguments
+from .finetuning_args import FinetuningArguments
+from .generating_args import GeneratingArguments
+from .model_args import ModelArguments
+from .parser import get_eval_args, get_infer_args, get_ray_args, get_train_args, read_args
+from .training_args import RayArguments, TrainingArguments
+
+
+__all__ = [
+    "DataArguments",
+    "EvaluationArguments",
+    "FinetuningArguments",
+    "GeneratingArguments",
+    "ModelArguments",
+    "RayArguments",
+    "TrainingArguments",
+    "get_eval_args",
+    "get_infer_args",
+    "get_ray_args",
+    "get_train_args",
+    "read_args",
+]
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8beed68627fc01a43a76d25f5e46320c2bc4385
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8688495f2a487879521c032f69a5775167c02052
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f7abed5d56af6ee803f224a4786a74a25fc8e95
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/data_args.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/data_args.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f19a72736603255f9d487aa1e09a1d537af441c7
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/data_args.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e25ab4b775a49156b02f3160c38d05342c990ed1
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9443f55e38408cd5096b2ed23207b9954913649f
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f3c37ed9fc6d86b74cf8a969206da910cb5ad9a
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40285510c4a3f306a9b13518364ff4fca5d6e2b8
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..679fda66e2a0c1fb342d46c8f85ef2334e6b8b09
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/generating_args.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/generating_args.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..563c3c76000cff1f343540cac8f222bf95049a8c
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/generating_args.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6103b637b787a2f07bd4b1b1bc5eb46d9085e78b
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/model_args.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/model_args.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7fc5422f87d997efd1f8916826ae2e1a83e6d95c
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/model_args.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5edb0bcbe1e6d06e777f319f3c303a2e2ceea997
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/parser.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/parser.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73e0613970b9093a21600e1ff62c9d0362177063
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/parser.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a93250c951fa497142bf37bf1bb6648a37945d8e
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/__pycache__/training_args.cpython-312.pyc b/LlamaFactory/src/llamafactory/hparams/__pycache__/training_args.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd4456dcf8eb9db1fe607efafb88a9d4c8b89deb
Binary files /dev/null and b/LlamaFactory/src/llamafactory/hparams/__pycache__/training_args.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/hparams/data_args.py b/LlamaFactory/src/llamafactory/hparams/data_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..11ad513d25e6920b1f5ff48e40f1acd0862ab2be
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/data_args.py
@@ -0,0 +1,188 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import asdict, dataclass, field
+from typing import Any, Literal
+
+
+@dataclass
+class DataArguments:
+    r"""Arguments pertaining to what data we are going to input our model for training and evaluation."""
+
+    template: str | None = field(
+        default=None,
+        metadata={"help": "Which template to use for constructing prompts in training and inference."},
+    )
+    dataset: str | None = field(  # becomes a list[str] in __post_init__ when a string is given
+        default=None,
+        metadata={"help": "The name of dataset(s) to use for training. Use commas to separate multiple datasets."},
+    )
+    eval_dataset: str | None = field(  # becomes a list[str] in __post_init__ when a string is given
+        default=None,
+        metadata={"help": "The name of dataset(s) to use for evaluation. Use commas to separate multiple datasets."},
+    )
+    dataset_dir: str = field(
+        default="data",
+        metadata={"help": "Path to the folder containing the datasets."},
+    )
+    media_dir: str | None = field(  # set to `dataset_dir` in __post_init__ when left as None
+        default=None,
+        metadata={"help": "Path to the folder containing the images, videos or audios. Defaults to `dataset_dir`."},
+    )
+    cutoff_len: int = field(  # reduced by 1 in __post_init__ when packing is enabled
+        default=2048,
+        metadata={"help": "The cutoff length of the tokenized inputs in the dataset."},
+    )
+    train_on_prompt: bool = field(  # rejected in __post_init__ when combined with mask_history
+        default=False,
+        metadata={"help": "Whether or not to disable the mask on the prompt."},
+    )
+    mask_history: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to mask the history and train on the last turn only."},
+    )
+    streaming: bool = field(
+        default=False,
+        metadata={"help": "Enable dataset streaming."},
+    )
+    buffer_size: int = field(
+        default=16384,
+        metadata={"help": "Size of the buffer to randomly sample examples from in dataset streaming."},
+    )
+    mix_strategy: Literal["concat", "interleave_under", "interleave_over", "interleave_once"] = field(
+        default="concat",
+        metadata={
+            "help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling/sampling w.o. replacement)."
+        },
+    )
+    interleave_probs: str | None = field(  # becomes a list[float] in __post_init__ when given
+        default=None,
+        metadata={"help": "Probabilities to sample data from datasets. Use commas to separate multiple datasets."},
+    )
+    overwrite_cache: bool = field(
+        default=False,
+        metadata={"help": "Overwrite the cached training and evaluation sets."},
+    )
+    preprocessing_batch_size: int = field(
+        default=1000,
+        metadata={"help": "The number of examples in one group in pre-processing."},
+    )
+    preprocessing_num_workers: int | None = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the pre-processing."},
+    )
+    max_samples: int | None = field(  # rejected in __post_init__ when streaming is enabled
+        default=None,
+        metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."},
+    )
+    eval_num_beams: int | None = field(
+        default=None,
+        metadata={"help": "Number of beams to use for evaluation. This argument will be passed to `model.generate`"},
+    )
+    ignore_pad_token_for_loss: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to ignore the tokens corresponding to the pad label in loss computation."},
+    )
+    val_size: float = field(  # values above 1e-6 are treated as "set" by the checks in __post_init__
+        default=0.0,
+        metadata={"help": "Size of the validation set, should be an integer or a float in range `[0,1)`."},
+    )
+    eval_on_each_dataset: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to evaluate on each dataset separately."},
+    )
+    packing: bool | None = field(  # forced to True in __post_init__ when neat_packing is set
+        default=None,
+        metadata={"help": "Enable sequences packing in training. Will automatically enable in pre-training."},
+    )
+    neat_packing: bool = field(
+        default=False,
+        metadata={"help": "Enable sequence packing without cross-attention."},
+    )
+    tool_format: str | None = field(
+        default=None,
+        metadata={"help": "Tool format to use for constructing function calling examples."},
+    )
+    default_system: str | None = field(
+        default=None,
+        metadata={"help": "Override the default system message in the template."},
+    )
+    enable_thinking: bool | None = field(
+        default=True,
+        metadata={"help": "Whether or not to enable thinking mode for reasoning models."},
+    )
+    tokenized_path: str | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Path to save or load the tokenized datasets. "
+                "If tokenized_path not exists, it will save the tokenized datasets. "
+                "If tokenized_path exists, it will load the tokenized datasets."
+            )
+        },
+    )
+    data_shared_file_system: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use a shared file system for the datasets."},
+    )
+
+    def __post_init__(self):  # normalizes list-like fields, then rejects incompatible option combinations
+        def split_arg(arg):  # "a, b" -> ["a", "b"]; non-strings (e.g. None) pass through unchanged
+            if isinstance(arg, str):
+                return [item.strip() for item in arg.split(",")]
+            return arg
+
+        self.dataset = split_arg(self.dataset)
+        self.eval_dataset = split_arg(self.eval_dataset)
+
+        if self.media_dir is None:
+            self.media_dir = self.dataset_dir  # media files default to living next to the datasets
+
+        if self.dataset is None and self.val_size > 1e-6:  # nothing to carve a validation split from
+            raise ValueError("Cannot specify `val_size` if `dataset` is None.")
+
+        if self.eval_dataset is not None and self.val_size > 1e-6:  # the two evaluation sources are exclusive
+            raise ValueError("Cannot specify `val_size` if `eval_dataset` is not None.")
+
+        if self.interleave_probs is not None:
+            if self.mix_strategy == "concat":
+                raise ValueError("`interleave_probs` is only valid for interleaved mixing.")
+
+            self.interleave_probs = list(map(float, split_arg(self.interleave_probs)))  # str -> list of floats
+            if self.dataset is not None and len(self.dataset) != len(self.interleave_probs):
+                raise ValueError("The length of dataset and interleave probs should be identical.")
+
+            if self.eval_dataset is not None and len(self.eval_dataset) != len(self.interleave_probs):
+                raise ValueError("The length of eval dataset and interleave probs should be identical.")
+
+        if self.streaming and self.val_size > 1e-6 and self.val_size < 1:  # fractional sizes are refused here
+            raise ValueError("Streaming mode should have an integer val size.")
+
+        if self.streaming and self.max_samples is not None:
+            raise ValueError("`max_samples` is incompatible with `streaming`.")
+
+        if self.mask_history and self.train_on_prompt:
+            raise ValueError("`mask_history` is incompatible with `train_on_prompt`.")
+
+        if self.neat_packing:
+            self.packing = True  # neat_packing implies packing
+
+        if self.packing:
+            self.cutoff_len -= 1  # avoid pad_to_multiple_of, needs improvement (mutates the supplied value)
+
+    def to_dict(self) -> dict[str, Any]:  # reflects the post-init state (e.g. `dataset` already a list)
+        return asdict(self)
diff --git a/LlamaFactory/src/llamafactory/hparams/evaluation_args.py b/LlamaFactory/src/llamafactory/hparams/evaluation_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..eddc618ba571097431326c428e9ff3a6d4e82ec8
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/evaluation_args.py
@@ -0,0 +1,60 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass, field
+from typing import Literal
+
+from datasets import DownloadMode
+
+
+@dataclass
+class EvaluationArguments:
+    r"""Arguments specifying the evaluation run: task, data folder, batching, few-shot setup and output path."""
+
+    task: str = field(  # the only field without a default, so it must always be supplied
+        metadata={"help": "Name of the evaluation task."},
+    )
+    task_dir: str = field(
+        default="evaluation",
+        metadata={"help": "Path to the folder containing the evaluation datasets."},
+    )
+    batch_size: int = field(
+        default=4,
+        metadata={"help": "The batch size per GPU for evaluation."},
+    )
+    seed: int = field(
+        default=42,
+        metadata={"help": "Random seed to be used with data loaders."},
+    )
+    lang: Literal["en", "zh"] = field(
+        default="en",
+        metadata={"help": "Language used at evaluation."},
+    )
+    n_shot: int = field(
+        default=5,
+        metadata={"help": "Number of examplars for few-shot learning."},
+    )
+    save_dir: str | None = field(  # must not exist yet when given; checked in __post_init__
+        default=None,
+        metadata={"help": "Path to save the evaluation results."},
+    )
+    download_mode: DownloadMode = field(
+        default=DownloadMode.REUSE_DATASET_IF_EXISTS,
+        metadata={"help": "Download mode used for the evaluation datasets."},
+    )
+
+    def __post_init__(self):  # presumably guards against clobbering earlier results; confirm with callers
+        if self.save_dir is not None and os.path.exists(self.save_dir):
+            raise ValueError("`save_dir` already exists, use another one.")
diff --git a/LlamaFactory/src/llamafactory/hparams/finetuning_args.py b/LlamaFactory/src/llamafactory/hparams/finetuning_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..c089ca67cbe510c3b8bf8a705762734f7eff9f9b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/finetuning_args.py
@@ -0,0 +1,594 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import asdict, dataclass, field
+from typing import Any, Literal
+
+
+@dataclass
+class FreezeArguments:
+    r"""Arguments pertaining to the freeze (partial-parameter) training: which layers and modules stay trainable."""
+
+    freeze_trainable_layers: int = field(  # the sign selects the end of the stack; see the help text
+        default=2,
+        metadata={
+            "help": (
+                "The number of trainable layers for freeze (partial-parameter) fine-tuning. "
+                "Positive numbers mean the last n layers are set as trainable, "
+                "negative numbers mean the first n layers are set as trainable."
+            )
+        },
+    )
+    freeze_trainable_modules: str = field(  # stored as the raw comma-separated string; not parsed in this class
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the available modules."
+            )
+        },
+    )
+    freeze_extra_modules: str | None = field(  # stored as the raw comma-separated string; not parsed in this class
+        default=None,
+        metadata={
+            "help": (
+                "Name(s) of modules apart from hidden layers to be set as trainable "
+                "for freeze (partial-parameter) fine-tuning. "
+                "Use commas to separate multiple modules."
+            )
+        },
+    )
+
+
+@dataclass
+class LoraArguments:
+    r"""Arguments pertaining to the LoRA training, including the LoRA+, rsLoRA, DoRA and PiSSA switches."""
+
+    additional_target: str | None = field(  # stored as the raw comma-separated string; not parsed in this class
+        default=None,
+        metadata={
+            "help": (
+                "Name(s) of modules apart from LoRA layers to be set as trainable "
+                "and saved in the final checkpoint. "
+                "Use commas to separate multiple modules."
+            )
+        },
+    )
+    lora_alpha: int | None = field(  # None here; the `lora_rank * 2` fallback is not applied in this class
+        default=None,
+        metadata={"help": "The scale factor for LoRA fine-tuning (default: lora_rank * 2)."},
+    )
+    lora_dropout: float = field(
+        default=0.0,
+        metadata={"help": "Dropout rate for the LoRA fine-tuning."},
+    )
+    lora_rank: int = field(
+        default=8,
+        metadata={"help": "The intrinsic dimension for LoRA fine-tuning."},
+    )
+    lora_target: str = field(  # stored as the raw comma-separated string; not parsed in this class
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of target modules to apply LoRA. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules."
+            )
+        },
+    )
+    loraplus_lr_ratio: float | None = field(  # None by default; presumably leaves LoRA+ off -- confirm at use site
+        default=None,
+        metadata={"help": "LoRA plus learning rate ratio (lr_B / lr_A)."},
+    )
+    loraplus_lr_embedding: float = field(
+        default=1e-6,
+        metadata={"help": "LoRA plus learning rate for lora embedding layers."},
+    )
+    use_rslora: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the rank stabilization scaling factor for LoRA layer."},
+    )
+    use_dora: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the weight-decomposed lora method (DoRA)."},
+    )
+    pissa_init: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to initialize a PiSSA adapter."},
+    )
+    pissa_iter: int = field(  # -1 is the documented "disabled" sentinel
+        default=16,
+        metadata={"help": "The number of iteration steps performed by FSVD in PiSSA. Use -1 to disable it."},
+    )
+    pissa_convert: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to convert the PiSSA adapter to a normal LoRA adapter."},
+    )
+    create_new_adapter: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to create a new adapter with randomly initialized weight."},
+    )
+
+
+@dataclass
+class OFTArguments:
+    r"""Arguments pertaining to the OFT training: target modules, rank / block size and dropout."""
+
+    additional_target: str | None = field(  # stored as the raw comma-separated string; not parsed in this class
+        default=None,
+        metadata={
+            "help": (
+                "Name(s) of modules apart from OFT layers to be set as trainable "
+                "and saved in the final checkpoint. "
+                "Use commas to separate multiple modules."
+            )
+        },
+    )
+    module_dropout: float = field(
+        default=0.0,
+        metadata={"help": "Dropout rate for the OFT fine-tuning."},
+    )
+    oft_rank: int = field(
+        default=0,
+        metadata={"help": "The intrinsic dimension for OFT fine-tuning."},
+    )
+    oft_block_size: int = field(  # help text previously duplicated the `oft_rank` description
+        default=32,
+        metadata={"help": "The block size for OFT fine-tuning."},
+    )
+    oft_target: str = field(  # stored as the raw comma-separated string; not parsed in this class
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of target modules to apply OFT. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules."
+            )
+        },
+    )
+    create_new_adapter: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to create a new adapter with randomly initialized weight."},
+    )
+
+
+@dataclass
+class RLHFArguments:
+    r"""Arguments pertaining to the PPO, DPO and KTO training (loss settings plus reference/reward model paths)."""
+
+    pref_beta: float = field(
+        default=0.1,
+        metadata={"help": "The beta parameter in the preference loss."},
+    )
+    pref_ftx: float = field(
+        default=0.0,
+        metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."},
+    )
+    pref_bco_weight: float = field(
+        default=0.0,
+        metadata={"help": "The Binary Classifier Optimization coefficient in DPO training."},
+    )
+    pref_loss: Literal["sigmoid", "hinge", "ipo", "kto_pair", "orpo", "simpo"] = field(
+        default="sigmoid",
+        metadata={"help": "The type of DPO loss to use."},
+    )
+    dpo_label_smoothing: float = field(  # the 0..0.5 range in the help text is not validated in this class
+        default=0.0,
+        metadata={"help": "The robust DPO label smoothing parameter in cDPO that should be between 0 and 0.5."},
+    )
+    kto_chosen_weight: float = field(
+        default=1.0,
+        metadata={"help": "The weight factor of the desirable losses in KTO training."},
+    )
+    kto_rejected_weight: float = field(
+        default=1.0,
+        metadata={"help": "The weight factor of the undesirable losses in KTO training."},
+    )
+    simpo_gamma: float = field(
+        default=0.5,
+        metadata={"help": "The target reward margin term in SimPO loss."},
+    )
+    ppo_buffer_size: int = field(
+        default=1,
+        metadata={"help": "The number of mini-batches to make experience buffer in a PPO optimization step."},
+    )
+    ppo_epochs: int = field(
+        default=4,
+        metadata={"help": "The number of epochs to perform in a PPO optimization step."},
+    )
+    ppo_score_norm: bool = field(
+        default=False,
+        metadata={"help": "Use score normalization in PPO training."},
+    )
+    ppo_target: float = field(
+        default=6.0,
+        metadata={"help": "Target KL value for adaptive KL control in PPO training."},
+    )
+    ppo_whiten_rewards: bool = field(
+        default=False,
+        metadata={"help": "Whiten the rewards before compute advantages in PPO training."},
+    )
+    ref_model: str | None = field(
+        default=None,
+        metadata={"help": "Path to the reference model used for the PPO or DPO training."},
+    )
+    ref_model_adapters: str | None = field(
+        default=None,
+        metadata={"help": "Path to the adapters of the reference model."},
+    )
+    ref_model_quantization_bit: int | None = field(
+        default=None,
+        metadata={"help": "The number of bits to quantize the reference model."},
+    )
+    reward_model: str | None = field(
+        default=None,
+        metadata={"help": "Path to the reward model used for the PPO training."},
+    )
+    reward_model_adapters: str | None = field(
+        default=None,
+        metadata={"help": "Path to the adapters of the reward model."},
+    )
+    reward_model_quantization_bit: int | None = field(
+        default=None,
+        metadata={"help": "The number of bits to quantize the reward model."},
+    )
+    reward_model_type: Literal["lora", "full", "api"] = field(
+        default="lora",
+        metadata={"help": "The type of the reward model in PPO training. Lora model only supports lora training."},
+    )
+    ld_alpha: float | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Alpha parameter from the LD-DPO paper, which controls the weighting of"
+                " the verbose token log-probabilities in responses."
+            )
+        },
+    )
+
+
+@dataclass
+class GaloreArguments:
+    r"""Arguments pertaining to the GaLore algorithm (gradient low-rank projection); off unless `use_galore` is set."""
+
+    use_galore: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the gradient low-Rank projection (GaLore)."},
+    )
+    galore_target: str = field(  # stored as the raw comma-separated string; not parsed in this class
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of modules to apply GaLore. Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules."
+            )
+        },
+    )
+    galore_rank: int = field(
+        default=16,
+        metadata={"help": "The rank of GaLore gradients."},
+    )
+    galore_update_interval: int = field(  # measured in steps, per the help text
+        default=200,
+        metadata={"help": "Number of steps to update the GaLore projection."},
+    )
+    galore_scale: float = field(
+        default=2.0,
+        metadata={"help": "GaLore scaling coefficient."},
+    )
+    galore_proj_type: Literal["std", "reverse_std", "right", "left", "full"] = field(
+        default="std",
+        metadata={"help": "Type of GaLore projection."},
+    )
+    galore_layerwise: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to enable layer-wise update to further save memory."},
+    )
+
+
+@dataclass
+class ApolloArguments:
+    r"""Arguments pertaining to the APOLLO algorithm.
+
+    The class only declares options and performs no validation of its own.
+    Each field's ``help`` metadata carries the user-facing description.
+    """
+
+    # Master switch. `FinetuningArguments.__post_init__` rejects combining it
+    # with LoRA fine-tuning, GaLore or BAdam.
+    use_apollo: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the APOLLO optimizer."},
+    )
+    # Declared as a comma-separated string; `FinetuningArguments.__post_init__`
+    # splits it into a list of stripped module names.
+    apollo_target: str = field(
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of modules to apply APOLLO. Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules."
+            )
+        },
+    )
+    apollo_rank: int = field(
+        default=16,
+        metadata={"help": "The rank of APOLLO gradients."},
+    )
+    apollo_update_interval: int = field(
+        default=200,
+        metadata={"help": "Number of steps to update the APOLLO projection."},
+    )
+    apollo_scale: float = field(
+        default=32.0,
+        metadata={"help": "APOLLO scaling coefficient."},
+    )
+    # Selects the low-rank projection algorithm; `apollo_proj_type` below is a
+    # separate option with its own set of accepted values.
+    apollo_proj: Literal["svd", "random"] = field(
+        default="random",
+        metadata={"help": "Type of APOLLO low-rank projection algorithm (svd or random)."},
+    )
+    apollo_proj_type: Literal["std", "right", "left"] = field(
+        default="std",
+        metadata={"help": "Type of APOLLO projection."},
+    )
+    apollo_scale_type: Literal["channel", "tensor"] = field(
+        default="channel",
+        metadata={"help": "Type of APOLLO scaling (channel or tensor)."},
+    )
+    apollo_layerwise: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to enable layer-wise update to further save memory."},
+    )
+    apollo_scale_front: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the norm-growth limiter in front of gradient scaling."},
+    )
+
+
+@dataclass
+class BAdamArgument:
+    r"""Arguments pertaining to the BAdam optimizer.
+
+    The class only declares options and performs no validation of its own.
+    Options are grouped by the mode they apply to: the ``badam_start_block`` /
+    ``badam_switch_*`` fields describe layer-wise BAdam, while
+    ``badam_update_ratio`` / ``badam_mask_mode`` describe ratio-wise BAdam.
+    """
+
+    # Master switch. `FinetuningArguments.__post_init__` rejects combining it
+    # with LoRA fine-tuning, GaLore or APOLLO.
+    use_badam: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the BAdam optimizer."},
+    )
+    badam_mode: Literal["layer", "ratio"] = field(
+        default="layer",
+        metadata={"help": "Whether to use layer-wise or ratio-wise BAdam optimizer."},
+    )
+    badam_start_block: int | None = field(
+        default=None,
+        metadata={"help": "The starting block index for layer-wise BAdam."},
+    )
+    badam_switch_mode: Literal["ascending", "descending", "random", "fixed"] | None = field(
+        default="ascending",
+        metadata={"help": "The strategy of picking block to update for layer-wise BAdam."},
+    )
+    badam_switch_interval: int | None = field(
+        default=50,
+        metadata={
+            "help": "Number of steps to update the block for layer-wise BAdam. Use -1 to disable the block update."
+        },
+    )
+    badam_update_ratio: float = field(
+        default=0.05,
+        metadata={"help": "The ratio of the update for ratio-wise BAdam."},
+    )
+    badam_mask_mode: Literal["adjacent", "scatter"] = field(
+        default="adjacent",
+        metadata={
+            "help": (
+                "The mode of the mask for BAdam optimizer. "
+                "`adjacent` means that the trainable parameters are adjacent to each other, "
+                "`scatter` means that trainable parameters are randomly chosen from the weight."
+            )
+        },
+    )
+    badam_verbose: int = field(
+        default=0,
+        metadata={
+            "help": (
+                "The verbosity level of BAdam optimizer. "
+                "0 for no print, 1 for print the block prefix, 2 for print trainable parameters."
+            )
+        },
+    )
+
+
+@dataclass
+class SwanLabArguments:
+    r"""Arguments pertaining to the SwanLab experiment tracking and visualization tool.
+
+    The class only declares options and performs no validation of its own.
+    """
+
+    use_swanlab: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the SwanLab (an experiment tracking and visualization tool)."},
+    )
+    swanlab_project: str | None = field(
+        default="llamafactory",
+        metadata={"help": "The project name in SwanLab."},
+    )
+    swanlab_workspace: str | None = field(
+        default=None,
+        metadata={"help": "The workspace name in SwanLab."},
+    )
+    swanlab_run_name: str | None = field(
+        default=None,
+        metadata={"help": "The experiment name in SwanLab."},
+    )
+    swanlab_mode: Literal["cloud", "local"] = field(
+        default="cloud",
+        metadata={"help": "The mode of SwanLab."},
+    )
+    # The name ends with `api_key`, so `FinetuningArguments.to_dict` replaces
+    # the value with a `<SWANLAB_API_KEY>` placeholder instead of exposing it.
+    swanlab_api_key: str | None = field(
+        default=None,
+        metadata={"help": "The API key for SwanLab."},
+    )
+    swanlab_logdir: str | None = field(
+        default=None,
+        metadata={"help": "The log directory for SwanLab."},
+    )
+    swanlab_lark_webhook_url: str | None = field(
+        default=None,
+        metadata={"help": "The Lark(飞书) webhook URL for SwanLab."},
+    )
+    # NOTE(review): unlike `swanlab_api_key`, this name does not end with
+    # `api_key`, so `FinetuningArguments.to_dict` returns it unmasked — confirm
+    # that is intended.
+    swanlab_lark_secret: str | None = field(
+        default=None,
+        metadata={"help": "The Lark(飞书) secret for SwanLab."},
+    )
+
+
+@dataclass
+class FinetuningArguments(
+    SwanLabArguments,
+    BAdamArgument,
+    ApolloArguments,
+    GaloreArguments,
+    RLHFArguments,
+    LoraArguments,
+    OFTArguments,
+    FreezeArguments,
+):
+    r"""Arguments pertaining to which techniques we are going to fine-tune with.
+
+    Aggregates every technique-specific argument group through inheritance and
+    adds the options that are shared across techniques. ``__post_init__``
+    normalizes the comma-separated options into lists and rejects option
+    combinations that cannot be used together.
+    """
+
+    pure_bf16: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
+    )
+    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "kto"] = field(
+        default="sft",
+        metadata={"help": "Which stage will be performed in training."},
+    )
+    finetuning_type: Literal["lora", "oft", "freeze", "full"] = field(
+        default="lora",
+        metadata={"help": "Which fine-tuning method to use."},
+    )
+    use_llama_pro: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to make only the parameters in the expanded blocks trainable."},
+    )
+    use_adam_mini: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the Adam-mini optimizer."},
+    )
+    use_mca: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether or not to use MCA (Megatron Core Adapter) training. "
+                "Controlled by USE_MCA environment variable."
+            )
+        },
+    )
+    use_muon: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the Muon optimizer."},
+    )
+    use_dft_loss: bool = field(
+        default=False,
+        metadata={"help": "Whether to use the DFT loss."},
+    )
+    use_eaft_loss: bool = field(
+        default=False,
+        metadata={"help": "Whether to use the EAFT loss."},
+    )
+    eaft_alpha: float = field(
+        default=1.0,
+        metadata={"help": "The alpha parameter for EAFT loss to control the power of adaptive weight."},
+    )
+    freeze_vision_tower: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to freeze the vision tower in MLLM training."},
+    )
+    freeze_multi_modal_projector: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to freeze the multi modal projector in MLLM training."},
+    )
+    freeze_language_model: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to freeze the language model in MLLM training."},
+    )
+    compute_accuracy: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to compute the token-level accuracy at evaluation."},
+    )
+    disable_shuffling: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to disable the shuffling of the training set."},
+    )
+    early_stopping_steps: int | None = field(
+        default=None,
+        metadata={"help": "Number of steps to stop training if the `metric_for_best_model` does not improve."},
+    )
+    plot_loss: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to save the training loss curves."},
+    )
+    include_effective_tokens_per_second: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to compute effective tokens per second."},
+    )
+
+    def __post_init__(self):
+        r"""Normalize list-like options and validate the combination of options.
+
+        Raises:
+            AssertionError: If `finetuning_type` or one of the quantization bit
+                options holds an unsupported value.
+            ValueError: If mutually exclusive or stage-specific options are combined.
+        """
+
+        def split_arg(arg):
+            # Only strings are split; `None` and already-split lists pass through unchanged.
+            if isinstance(arg, str):
+                return [item.strip() for item in arg.split(",")]
+            return arg
+
+        self.freeze_trainable_modules: list[str] = split_arg(self.freeze_trainable_modules)
+        self.freeze_extra_modules: list[str] | None = split_arg(self.freeze_extra_modules)
+        # A falsy `lora_alpha` falls back to twice the LoRA rank.
+        self.lora_alpha: int = self.lora_alpha or self.lora_rank * 2
+        self.lora_target: list[str] = split_arg(self.lora_target)
+        self.oft_target: list[str] = split_arg(self.oft_target)
+        self.additional_target: list[str] | None = split_arg(self.additional_target)
+        self.galore_target: list[str] = split_arg(self.galore_target)
+        self.apollo_target: list[str] = split_arg(self.apollo_target)
+        # Derived flag, not a dataclass field: true for DPO unless the loss is ORPO or SimPO.
+        self.use_ref_model = self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"]
+
+        assert self.finetuning_type in ["lora", "oft", "freeze", "full"], "Invalid fine-tuning method."
+        assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
+        assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
+
+        if self.stage == "ppo" and self.reward_model is None:
+            raise ValueError("`reward_model` is necessary for PPO training.")
+
+        if self.stage == "ppo" and self.reward_model_type == "lora" and self.finetuning_type != "lora":
+            raise ValueError("`reward_model_type` cannot be lora for Freeze/Full PPO training.")
+
+        # NOTE(review): `reward_model_type` is annotated as Literal["lora", "full", "api"],
+        # so "oft" is not a declared value and this check looks unreachable — confirm.
+        if self.stage == "ppo" and self.reward_model_type == "oft" and self.finetuning_type != "oft":
+            raise ValueError("`reward_model_type` cannot be oft for Freeze/Full PPO training.")
+
+        if self.stage == "dpo" and self.pref_loss != "sigmoid" and self.dpo_label_smoothing > 1e-6:
+            raise ValueError("`dpo_label_smoothing` is only valid for sigmoid loss function.")
+
+        if self.use_llama_pro and self.finetuning_type == "full":
+            raise ValueError("`use_llama_pro` is only valid for Freeze or LoRA training.")
+
+        if self.finetuning_type == "lora" and (self.use_galore or self.use_apollo or self.use_badam):
+            raise ValueError("Cannot use LoRA with GaLore, APOLLO or BAdam together.")
+
+        # At most one of the three optimizers may be enabled; coerce every flag the same way.
+        if int(self.use_galore) + int(self.use_apollo) + int(self.use_badam) > 1:
+            raise ValueError("Cannot use GaLore, APOLLO or BAdam together.")
+
+        if self.pissa_init and (self.stage in ["ppo", "kto"] or self.use_ref_model):
+            raise ValueError("Cannot use PiSSA for current training stage.")
+
+        # The remaining options only make sense when LoRA is the fine-tuning method.
+        if self.finetuning_type != "lora":
+            if self.loraplus_lr_ratio is not None:
+                raise ValueError("`loraplus_lr_ratio` is only valid for LoRA training.")
+
+            if self.use_rslora:
+                raise ValueError("`use_rslora` is only valid for LoRA training.")
+
+            if self.use_dora:
+                raise ValueError("`use_dora` is only valid for LoRA training.")
+
+            if self.pissa_init:
+                raise ValueError("`pissa_init` is only valid for LoRA training.")
+
+    def to_dict(self) -> dict[str, Any]:
+        r"""Return the dataclass fields as a dict with API keys masked.
+
+        Every field whose name ends with `api_key` has its value replaced by the
+        placeholder `<FIELD_NAME>` (upper-cased field name), whether or not a
+        key was actually set.
+        """
+        args = asdict(self)
+        args = {k: f"<{k.upper()}>" if k.endswith("api_key") else v for k, v in args.items()}
+        return args
diff --git a/LlamaFactory/src/llamafactory/hparams/generating_args.py b/LlamaFactory/src/llamafactory/hparams/generating_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eacb14763ac972a08ea59b74a799e704e4f9bc9
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/generating_args.py
@@ -0,0 +1,83 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import asdict, dataclass, field
+from typing import Any
+
+from transformers import GenerationConfig
+
+
+@dataclass
+class GeneratingArguments:
+    r"""Decoding parameters used when generating text."""
+
+    do_sample: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to use sampling, use greedy decoding otherwise."},
+    )
+    temperature: float = field(
+        default=0.95,
+        metadata={"help": "The value used to modulate the next token probabilities."},
+    )
+    top_p: float = field(
+        default=0.7,
+        metadata={
+            "help": (
+                "The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept."
+            )
+        },
+    )
+    top_k: int = field(
+        default=50,
+        metadata={"help": "The number of highest probability vocabulary tokens to keep for top-k filtering."},
+    )
+    num_beams: int = field(
+        default=1,
+        metadata={"help": "Number of beams for beam search. 1 means no beam search."},
+    )
+    max_length: int = field(
+        default=1024,
+        metadata={"help": "The maximum length the generated tokens can have. It can be overridden by max_new_tokens."},
+    )
+    max_new_tokens: int = field(
+        default=1024,
+        metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."},
+    )
+    repetition_penalty: float = field(
+        default=1.0,
+        metadata={"help": "The parameter for repetition penalty. 1.0 means no penalty."},
+    )
+    length_penalty: float = field(
+        default=1.0,
+        metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
+    )
+    skip_special_tokens: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to remove special tokens in the decoding."},
+    )
+
+    def to_dict(self, obey_generation_config: bool = False) -> dict[str, Any]:
+        r"""Export the decoding parameters as a plain dict.
+
+        Exactly one of the two length limits is kept: a positive
+        `max_new_tokens` drops `max_length`, otherwise `max_new_tokens` is dropped.
+
+        Args:
+            obey_generation_config: When true, keep only the entries whose name
+                is an attribute of a default-constructed `GenerationConfig`.
+
+        Returns:
+            The remaining parameters, in field declaration order.
+        """
+        params = asdict(self)
+        discarded = "max_length" if params.get("max_new_tokens", -1) > 0 else "max_new_tokens"
+        params.pop(discarded, None)
+
+        if obey_generation_config:
+            reference = GenerationConfig()
+            params = {name: value for name, value in params.items() if hasattr(reference, name)}
+
+        return params
diff --git a/LlamaFactory/src/llamafactory/hparams/model_args.py b/LlamaFactory/src/llamafactory/hparams/model_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bfaa27344b3b2c1071a4010c83d28a3832e33e6
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/model_args.py
@@ -0,0 +1,568 @@
+# Copyright 2025 HuggingFace Inc., the KVCache.AI team, Approaching AI, and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from dataclasses import asdict, dataclass, field, fields
+from typing import Any, Literal, Self
+
+import torch
+from omegaconf import OmegaConf
+from transformers.training_args import _convert_str_dict
+
+from ..extras.constants import AttentionFunction, EngineName, QuantizationMethod, RopeScaling
+from ..extras.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class BaseModelArguments:
+    r"""Core options describing which model to load and how to load it."""
+
+    model_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Path to the model weight or identifier from huggingface.co/models or modelscope.cn/models."
+        },
+    )
+    adapter_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Path to the adapter weight or identifier from huggingface.co/models. "
+                "Use commas to separate multiple adapters."
+            )
+        },
+    )
+    adapter_folder: str | None = field(
+        default=None,
+        metadata={"help": "The folder containing the adapter weights to load."},
+    )
+    cache_dir: str | None = field(
+        default=None,
+        metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."},
+    )
+    resize_vocab: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."},
+    )
+    split_special_tokens: bool = field(
+        default=False,
+        metadata={"help": "Whether or not the special tokens should be split during the tokenization process."},
+    )
+    add_tokens: str | None = field(
+        default=None,
+        metadata={
+            "help": "Non-special tokens to be added into the tokenizer. Use commas to separate multiple tokens."
+        },
+    )
+    add_special_tokens: str | None = field(
+        default=None,
+        metadata={"help": "Special tokens to be added into the tokenizer. Use commas to separate multiple tokens."},
+    )
+    new_special_tokens_config: str | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Path to YAML config with special token descriptions for semantic initialization. "
+                "If set, this takes precedence over add_special_tokens. "
+                "YAML format: {'<token>': 'description text', ...}"
+            )
+        },
+    )
+    init_special_tokens: Literal["noise_init", "desc_init", "desc_init_w_noise"] = field(
+        default="noise_init",
+        metadata={
+            "help": (
+                "Initialization method for new special tokens: "
+                "'noise_init' (default, random noise around mean), "
+                "'desc_init' (semantic initialization from descriptions), "
+                "'desc_init_w_noise' (semantic + random noise). "
+                "Note: 'desc_init' methods require new_special_tokens_config."
+            )
+        },
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    low_cpu_mem_usage: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to use memory-efficient model loading."},
+    )
+    rope_scaling: RopeScaling | None = field(
+        default=None,
+        metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
+    )
+    flash_attn: AttentionFunction = field(
+        default=AttentionFunction.AUTO,
+        metadata={"help": "Enable FlashAttention for faster training and inference."},
+    )
+    shift_attn: bool = field(
+        default=False,
+        metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."},
+    )
+    mixture_of_depths: Literal["convert", "load"] | None = field(
+        default=None,
+        metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."},
+    )
+    use_unsloth: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
+    )
+    use_unsloth_gc: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use unsloth's gradient checkpointing (no need to install unsloth)."},
+    )
+    enable_liger_kernel: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to enable liger kernel for faster training."},
+    )
+    moe_aux_loss_coef: float | None = field(
+        default=None,
+        metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},
+    )
+    disable_gradient_checkpointing: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to disable gradient checkpointing."},
+    )
+    use_reentrant_gc: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to use reentrant gradient checkpointing."},
+    )
+    upcast_layernorm: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to upcast the layernorm weights in fp32."},
+    )
+    upcast_lmhead_output: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to upcast the output of lm_head in fp32."},
+    )
+    train_from_scratch: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to randomly initialize the model weights."},
+    )
+    infer_backend: EngineName = field(
+        default=EngineName.HF,
+        metadata={"help": "Backend engine used at inference."},
+    )
+    offload_folder: str = field(
+        default="offload",
+        metadata={"help": "Path to offload model weights."},
+    )
+    use_kv_cache: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to use KV cache in generation."},
+    )
+    use_v1_kernels: bool | None = field(
+        default=False,
+        metadata={"help": "Whether or not to use high-performance kernels in training."},
+    )
+    infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field(
+        default="auto",
+        metadata={"help": "Data type for model weights and activations at inference."},
+    )
+    hf_hub_token: str | None = field(
+        default=None,
+        metadata={"help": "Auth token to log in with Hugging Face Hub."},
+    )
+    ms_hub_token: str | None = field(
+        default=None,
+        metadata={"help": "Auth token to log in with ModelScope Hub."},
+    )
+    om_hub_token: str | None = field(
+        default=None,
+        metadata={"help": "Auth token to log in with Modelers Hub."},
+    )
+    print_param_status: bool = field(
+        default=False,
+        metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
+    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={"help": "Whether to trust the execution of code from datasets/models defined on the Hub or not."},
+    )
+
+    def __post_init__(self):
+        r"""Check the model path and normalize the adapter and token options.
+
+        Comma-separated strings become lists of stripped items. Special tokens
+        come from the YAML file named by `new_special_tokens_config` when it is
+        set, otherwise from `add_special_tokens`. The token descriptions (or
+        `None`) are stored on the private `_special_token_descriptions` attribute.
+
+        Raises:
+            ValueError: If `model_name_or_path` is missing or the YAML config is
+                not a mapping. Any error raised while loading the config is
+                logged and re-raised.
+        """
+        if self.model_name_or_path is None:
+            raise ValueError("Please provide `model_name_or_path`.")
+
+        def comma_separated(raw: str) -> list[str]:
+            return [piece.strip() for piece in raw.split(",")]
+
+        # Several adapters may be given at once, so the option becomes a list of paths.
+        if self.adapter_name_or_path is not None:
+            self.adapter_name_or_path = comma_separated(self.adapter_name_or_path)
+
+        if self.add_tokens is not None:
+            self.add_tokens = comma_separated(self.add_tokens)
+
+        config_path = self.new_special_tokens_config
+        if config_path is None:
+            # No YAML config means no descriptions; the plain option, when given,
+            # is a comma-separated string.
+            if self.add_special_tokens is not None:
+                self.add_special_tokens = comma_separated(self.add_special_tokens)
+
+            self._special_token_descriptions = None
+        else:
+            # The YAML config wins over `add_special_tokens`: its keys are the
+            # tokens and its values are the descriptions.
+            try:
+                descriptions = OmegaConf.to_container(OmegaConf.load(config_path))
+                if not isinstance(descriptions, dict):
+                    raise ValueError(
+                        f"YAML config must be a dictionary mapping tokens to descriptions. "
+                        f"Got: {type(descriptions)}"
+                    )
+
+                token_names = list(descriptions.keys())
+                if self.add_special_tokens is not None:
+                    logger.warning_rank0(
+                        "Both 'new_special_tokens_config' and 'add_special_tokens' are set. "
+                        f"Using tokens from config: {token_names}"
+                    )
+
+                self.add_special_tokens = token_names
+                self._special_token_descriptions = descriptions
+                logger.info_rank0(
+                    f"Loaded {len(token_names)} special tokens with descriptions from: "
+                    f"{config_path}"
+                )
+            except Exception as e:
+                # NOTE(review): confirm the project logger provides `error_rank0`;
+                # only the `info_rank0`/`warning_rank0` variants are used elsewhere in this class.
+                logger.error_rank0(
+                    f"Failed to load special tokens config from '{config_path}': {e}"
+                )
+                raise
+
+        # Description-based initialization is impossible without descriptions, so downgrade it.
+        wants_descriptions = self.init_special_tokens in ["desc_init", "desc_init_w_noise"]
+        if wants_descriptions and self._special_token_descriptions is None:
+            logger.warning_rank0(
+                f"init_special_tokens='{self.init_special_tokens}' requires new_special_tokens_config. "
+                "Falling back to 'noise_init'"
+            )
+            self.init_special_tokens = "noise_init"
+
+
+@dataclass
+class QuantizationArguments:
+    r"""Arguments pertaining to the quantization method.
+
+    The class only declares options and performs no validation of its own.
+    """
+
+    quantization_method: QuantizationMethod = field(
+        default=QuantizationMethod.BNB,
+        metadata={"help": "Quantization method to use for on-the-fly quantization."},
+    )
+    # Defaults to None; the accepted bit widths are not checked in this class.
+    quantization_bit: int | None = field(
+        default=None,
+        metadata={"help": "The number of bits to quantize the model using on-the-fly quantization."},
+    )
+    quantization_type: Literal["fp4", "nf4"] = field(
+        default="nf4",
+        metadata={"help": "Quantization data type to use in bitsandbytes int4 training."},
+    )
+    double_quantization: bool = field(
+        default=True,
+        metadata={"help": "Whether or not to use double quantization in bitsandbytes int4 training."},
+    )
+    # The only non-None value allowed by the annotation is "auto".
+    quantization_device_map: Literal["auto"] | None = field(
+        default=None,
+        metadata={"help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."},
+    )
+
+
+@dataclass
+class ProcessorArguments:
+    r"""Options for the multimodal (image, video, audio) input processor."""
+
+    image_max_pixels: int = field(
+        default=768 * 768,
+        metadata={"help": "The maximum number of pixels of image inputs."},
+    )
+    image_min_pixels: int = field(
+        default=32 * 32,
+        metadata={"help": "The minimum number of pixels of image inputs."},
+    )
+    image_do_pan_and_scan: bool = field(
+        default=False,
+        metadata={"help": "Use pan and scan to process image for gemma3."},
+    )
+    crop_to_patches: bool = field(
+        default=False,
+        metadata={"help": "Whether to crop the image to patches for internvl."},
+    )
+    video_max_pixels: int = field(
+        default=256 * 256,
+        metadata={"help": "The maximum number of pixels of video inputs."},
+    )
+    video_min_pixels: int = field(
+        default=16 * 16,
+        metadata={"help": "The minimum number of pixels of video inputs."},
+    )
+    video_fps: float = field(
+        default=2.0,
+        metadata={"help": "The frames to sample per second for video inputs."},
+    )
+    video_maxlen: int = field(
+        default=128,
+        metadata={"help": "The maximum number of sampled frames for video inputs."},
+    )
+    use_audio_in_video: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use audio in video inputs."},
+    )
+    audio_sampling_rate: int = field(
+        default=16000,
+        metadata={"help": "The sampling rate of audio inputs."},
+    )
+
+    def __post_init__(self):
+        r"""Reject pixel ranges whose maximum is below the minimum.
+
+        Raises:
+            ValueError: For the image range first, then the video range.
+        """
+        for medium in ("image", "video"):
+            lower = getattr(self, f"{medium}_min_pixels")
+            upper = getattr(self, f"{medium}_max_pixels")
+            if upper < lower:
+                raise ValueError(f"`{medium}_max_pixels` cannot be smaller than `{medium}_min_pixels`.")
+
+
+@dataclass
+class ExportArguments:
+    r"""Options controlling how a model is exported."""
+
+    export_dir: str | None = field(
+        default=None,
+        metadata={"help": "Path to the directory to save the exported model."},
+    )
+    export_size: int = field(
+        default=5,
+        metadata={"help": "The file shard size (in GB) of the exported model."},
+    )
+    export_device: Literal["cpu", "auto"] = field(
+        default="cpu",
+        metadata={"help": "The device used in model export, use `auto` to accelerate exporting."},
+    )
+    export_quantization_bit: int | None = field(
+        default=None,
+        metadata={"help": "The number of bits to quantize the exported model."},
+    )
+    export_quantization_dataset: str | None = field(
+        default=None,
+        metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."},
+    )
+    export_quantization_nsamples: int = field(
+        default=128,
+        metadata={"help": "The number of samples used for quantization."},
+    )
+    export_quantization_maxlen: int = field(
+        default=1024,
+        metadata={"help": "The maximum length of the model inputs used for quantization."},
+    )
+    export_legacy_format: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."},
+    )
+    export_hub_model_id: str | None = field(
+        default=None,
+        metadata={"help": "The name of the repository if push the model to the Hugging Face hub."},
+    )
+
+    def __post_init__(self):
+        r"""Require a quantization dataset whenever a quantization bit width is set.
+
+        Raises:
+            ValueError: If `export_quantization_bit` is set without `export_quantization_dataset`.
+        """
+        quantizing = self.export_quantization_bit is not None
+        has_dataset = self.export_quantization_dataset is not None
+        if quantizing and not has_dataset:
+            raise ValueError("Quantization dataset is necessary for exporting.")
+
+
+@dataclass
+class VllmArguments:
+    r"""Arguments pertaining to the vLLM worker; a JSON-object string in `vllm_config` is decoded on init."""
+
+    vllm_maxlen: int = field(
+        default=4096,
+        metadata={"help": "Maximum sequence (prompt + response) length of the vLLM engine."},
+    )
+    vllm_gpu_util: float = field(
+        default=0.7,
+        metadata={"help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."},
+    )
+    vllm_enforce_eager: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to disable CUDA graph in the vLLM engine."},
+    )
+    vllm_max_lora_rank: int = field(
+        default=32,
+        metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."},
+    )
+    vllm_config: dict | str | None = field(
+        default=None,
+        metadata={"help": "Config to initialize the vllm engine. Please use JSON strings."},
+    )
+
+    def __post_init__(self):  # only strings starting with "{" are decoded; other values are kept as given
+        if isinstance(self.vllm_config, str) and self.vllm_config.startswith("{"):
+            self.vllm_config = _convert_str_dict(json.loads(self.vllm_config))
+
+
+@dataclass
+class SGLangArguments:
+    r"""Arguments pertaining to the SGLang worker; a JSON-object string in `sglang_config` is decoded on init."""
+
+    sglang_maxlen: int = field(
+        default=4096,
+        metadata={"help": "Maximum sequence (prompt + response) length of the SGLang engine."},
+    )
+    sglang_mem_fraction: float = field(
+        default=0.7,
+        metadata={"help": "The memory fraction (0-1) to be used for the SGLang engine."},
+    )
+    sglang_tp_size: int = field(
+        default=-1,
+        metadata={"help": "Tensor parallel size for the SGLang engine."},
+    )
+    sglang_config: dict | str | None = field(
+        default=None,
+        metadata={"help": "Config to initialize the SGLang engine. Please use JSON strings."},
+    )
+    sglang_lora_backend: Literal["triton", "flashinfer"] = field(
+        default="triton",
+        metadata={
+            "help": "The backend of running GEMM kernels for Lora modules. Recommend using the Triton LoRA backend for better performance and stability."
+        },
+    )
+
+    def __post_init__(self):  # only strings starting with "{" are decoded; other values are kept as given
+        if isinstance(self.sglang_config, str) and self.sglang_config.startswith("{"):
+            self.sglang_config = _convert_str_dict(json.loads(self.sglang_config))
+
+
+@dataclass
+class KTransformersArguments:
+    r"""Arguments pertaining to KTransformers (KT) training and the KT engine."""
+
+    use_kt: bool = field(
+        default=False,
+        metadata={"help": "Whether To Use KTransformers Optimizations For LoRA Training."},
+    )
+    kt_optimize_rule: str | None = field(
+        default=None,
+        metadata={
+            "help": "Path To The KTransformers Optimize Rule; See https://github.com/kvcache-ai/ktransformers/."
+        },
+    )
+    cpu_infer: int | None = field(
+        default=32,
+        metadata={"help": "Number Of CPU Cores Used For Computation."},
+    )
+    chunk_size: int | None = field(
+        default=8192,
+        metadata={"help": "Chunk Size Used For CPU Compute In KTransformers."},
+    )
+    mode: str | None = field(
+        default="normal",
+        metadata={"help": "Normal Or Long_Context For Llama Models."},
+    )
+    # NOTE(review): `mode` above and `kt_mode` below share a default and similar help; confirm both are needed.
+    kt_maxlen: int = field(
+        default=4096,
+        metadata={"help": "Maximum Sequence (Prompt + Response) Length Of The KT Engine."},
+    )
+    kt_use_cuda_graph: bool = field(
+        default=True,
+        metadata={"help": "Whether To Use CUDA Graphs For The KT Engine."},
+    )
+    kt_mode: str = field(
+        default="normal",
+        metadata={"help": "Normal Or Long_Context Mode For The KT Engine."},
+    )
+    kt_force_think: bool = field(
+        default=False,
+        metadata={"help": "Force-Think Toggle For The KT Engine."},
+    )
+
+
+@dataclass
+class ModelArguments(
+    SGLangArguments,
+    VllmArguments,
+    KTransformersArguments,
+    ExportArguments,
+    ProcessorArguments,
+    QuantizationArguments,
+    BaseModelArguments,
+):
+    r"""Arguments pertaining to which model/config/tokenizer we are going to fine-tune or infer.
+
+    The rightmost base class is displayed first. Fields declared with `init=False` cannot be passed to `__init__`.
+    """
+
+    compute_dtype: torch.dtype | None = field(
+        default=None,
+        init=False,
+        metadata={"help": "Torch data type for computing model outputs, derived from `fp/bf16`. Do not specify it."},
+    )
+    device_map: str | dict[str, Any] | None = field(
+        default=None,
+        init=False,
+        metadata={"help": "Device map for model placement, derived from training stage. Do not specify it."},
+    )
+    model_max_length: int | None = field(
+        default=None,
+        init=False,
+        metadata={"help": "The maximum input length for model, derived from `cutoff_len`. Do not specify it."},
+    )
+    block_diag_attn: bool = field(
+        default=False,
+        init=False,
+        metadata={"help": "Whether use block diag attention or not, derived from `neat_packing`. Do not specify it."},
+    )
+
+    def __post_init__(self):
+        # Invoke the `__post_init__` hooks of the listed parents explicitly,
+        # one after another and in exactly this order.
+        parents = (BaseModelArguments, ProcessorArguments, ExportArguments, VllmArguments, SGLangArguments)
+        for parent in parents:
+            parent.__post_init__(self)
+
+    @classmethod
+    def copyfrom(cls, source: "Self", **kwargs) -> "Self":
+        r"""Clone `source`: init fields (overridable through `kwargs`) go to `cls`, non-init fields are set after."""
+        ctor_kwargs: dict[str, Any] = {}
+        derived: dict[str, Any] = {}
+        for spec in fields(source):
+            bucket = ctor_kwargs if spec.init else derived
+            bucket[spec.name] = getattr(source, spec.name)
+
+        ctor_kwargs.update(kwargs)
+        clone = cls(**ctor_kwargs)
+        for key, val in derived.items():
+            setattr(clone, key, val)
+
+        return clone
+
+    def to_dict(self) -> dict[str, Any]:
+        r"""Dump the fields into a dict; the value of every key ending in `token` is replaced by `<KEY>`."""
+        dumped = asdict(self)
+        return {key: (f"<{key.upper()}>" if key.endswith("token") else val) for key, val in dumped.items()}
diff --git a/LlamaFactory/src/llamafactory/hparams/parser.py b/LlamaFactory/src/llamafactory/hparams/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..81762635bba758a2f9184f858a8bd6caa5eb0afa
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/parser.py
@@ -0,0 +1,523 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any, Optional
+
+import torch
+import transformers
+from omegaconf import OmegaConf
+from transformers import HfArgumentParser
+from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.training_args import ParallelMode
+from transformers.utils import is_torch_bf16_gpu_available, is_torch_npu_available
+
+from ..extras import logging
+from ..extras.constants import CHECKPOINT_NAMES, EngineName
+from ..extras.misc import check_dependencies, check_version, get_current_device, is_env_enabled
+from ..extras.packages import is_mcore_adapter_available, is_transformers_version_greater_than
+from .data_args import DataArguments
+from .evaluation_args import EvaluationArguments
+from .finetuning_args import FinetuningArguments
+from .generating_args import GeneratingArguments
+from .model_args import ModelArguments
+from .training_args import RayArguments, TrainingArguments
+
+
+logger = logging.get_logger(__name__)
+
+check_dependencies()
+
+# For each entry point: `*_ARGS` lists the dataclasses given to HfArgumentParser, `*_CLS` types the parsed tuple.
+_TRAIN_ARGS = [ModelArguments, DataArguments, TrainingArguments, FinetuningArguments, GeneratingArguments]
+_TRAIN_CLS = tuple[ModelArguments, DataArguments, TrainingArguments, FinetuningArguments, GeneratingArguments]
+_INFER_ARGS = [ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments]
+_INFER_CLS = tuple[ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments]
+_EVAL_ARGS = [ModelArguments, DataArguments, EvaluationArguments, FinetuningArguments]
+_EVAL_CLS = tuple[ModelArguments, DataArguments, EvaluationArguments, FinetuningArguments]
+
+if is_mcore_adapter_available() and is_env_enabled("USE_MCA"):
+    from mcore_adapter import TrainingArguments as McaTrainingArguments
+
+    _TRAIN_MCA_ARGS = [ModelArguments, DataArguments, McaTrainingArguments, FinetuningArguments, GeneratingArguments]
+    _TRAIN_MCA_CLS = tuple[
+        ModelArguments, DataArguments, McaTrainingArguments, FinetuningArguments, GeneratingArguments
+    ]
+else:  # mcore_adapter is unavailable or USE_MCA is not enabled: keep empty placeholders
+    _TRAIN_MCA_ARGS = []
+    _TRAIN_MCA_CLS = tuple()
+
+
+def read_args(args: dict[str, Any] | list[str] | None = None) -> dict[str, Any] | list[str]:
+    r"""Get arguments from `args`, or else from a YAML/JSON config file or the raw command line (`sys.argv`)."""
+    if args is not None:
+        return args
+
+    config_file = sys.argv[1] if len(sys.argv) > 1 else ""
+    if config_file.endswith((".yaml", ".yml")):
+        dict_config = OmegaConf.load(Path(config_file).absolute())
+    elif config_file.endswith(".json"):
+        # `json.load` needs a file object, so read the text and parse it (a bare `Path` has no `.read()`).
+        dict_config = OmegaConf.create(json.loads(Path(config_file).absolute().read_text(encoding="utf-8")))
+    else:
+        return sys.argv[1:]
+
+    return OmegaConf.to_container(OmegaConf.merge(dict_config, OmegaConf.from_cli(sys.argv[2:])))
+
+
+def _parse_args(
+    parser: "HfArgumentParser", args: dict[str, Any] | list[str] | None = None, allow_extra_keys: bool = False
+) -> tuple[Any]:
+    r"""Parse dict configs via `parse_dict`; parse CLI strings into dataclasses and reject leftover arguments."""
+    resolved = read_args(args)
+    if isinstance(resolved, dict):
+        return parser.parse_dict(resolved, allow_extra_keys=allow_extra_keys)
+
+    *dataclasses_, leftovers = parser.parse_args_into_dataclasses(args=resolved, return_remaining_strings=True)
+    if allow_extra_keys or not leftovers:
+        return tuple(dataclasses_)
+
+    print(parser.format_help())
+    print(f"Got unknown args, potentially deprecated arguments: {leftovers}")
+    raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {leftovers}")
+
+
+def _set_transformers_logging() -> None:
+    if os.environ.get("LLAMAFACTORY_VERBOSITY", "INFO") in ("DEBUG", "INFO"):  # an unset variable counts as INFO
+        hf = transformers.utils.logging
+        for enable in (hf.set_verbosity_info, hf.enable_default_handler, hf.enable_explicit_format):
+            enable()
+
+
+def _set_env_vars() -> None:
+    if not is_torch_npu_available():
+        return  # the settings below are applied on NPU devices only
+    # avoid JIT compile on NPU devices unless NPU_JIT_COMPILE is enabled, see https://zhuanlan.zhihu.com/p/660875458
+    torch.npu.set_compile_mode(jit_compile=is_env_enabled("NPU_JIT_COMPILE"))
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # https://github.com/hiyouga/LLaMA-Factory/issues/7447
+
+
+def _verify_model_args(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    finetuning_args: "FinetuningArguments",
+) -> None:
+    r"""Reject adapter and quantization settings that cannot be combined (`data_args` is not inspected)."""
+    adapters = model_args.adapter_name_or_path
+    if adapters is not None and finetuning_args.finetuning_type != "lora":
+        raise ValueError("Adapter is only valid for the LoRA method.")
+
+    if model_args.quantization_bit is None:
+        return
+
+    if finetuning_args.finetuning_type not in ["lora", "oft"]:
+        raise ValueError("Quantization is only compatible with the LoRA or OFT method.")
+    if finetuning_args.pissa_init:
+        raise ValueError("Please use scripts/pissa_init.py to initialize PiSSA for a quantized model.")
+    if model_args.resize_vocab:
+        raise ValueError("Cannot resize embedding layers of a quantized model.")
+    if adapters is not None and finetuning_args.create_new_adapter:
+        raise ValueError("Cannot create new adapter upon a quantized model.")
+    if adapters is not None and len(adapters) != 1:
+        raise ValueError("Quantized model only accepts a single adapter. Merge them first.")
+
+
+def _check_extra_dependencies(
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    training_args: Optional["TrainingArguments"] = None,
+) -> None:
+    r"""Verify, through `check_version`, the optional packages needed by the enabled features.
+
+    Args:
+        model_args: read for the KTransformers, Unsloth, Liger kernel, MoD and inference backend settings.
+        finetuning_args: read for the GaLore, APOLLO, BAdam, Adam-mini, SwanLab and loss plotting switches.
+        training_args: optional; when given, its DeepSpeed and `predict_with_generate` settings are read too.
+
+    The checks run in the same order as the statements below.
+    """
+
+    def require(enabled: Any, *requirements: str) -> None:
+        # each requirement is checked with `mandatory=True`, but only when its feature is switched on
+        if enabled:
+            for requirement in requirements:
+                check_version(requirement, mandatory=True)
+
+    # model-level features
+    require(model_args.use_kt, "ktransformers")
+    require(model_args.use_unsloth, "unsloth")
+    require(model_args.enable_liger_kernel, "liger-kernel")
+    require(model_args.mixture_of_depths is not None, "mixture-of-depth>=1.1.6")
+
+    # inference engines: the version range is checked without `mandatory=True`, the bare package with it
+    backend = model_args.infer_backend
+    if backend == EngineName.VLLM:
+        check_version("vllm>=0.4.3,<=0.11.0")
+        require(True, "vllm")
+    elif backend == EngineName.SGLANG:
+        check_version("sglang>=0.4.5")
+        require(True, "sglang")
+
+    # optimizers, experiment tracking and plotting
+    require(finetuning_args.use_galore, "galore_torch")
+    require(finetuning_args.use_apollo, "apollo_torch")
+    require(finetuning_args.use_badam, "badam>=1.2.1")
+    require(finetuning_args.use_adam_mini, "adam-mini")
+    require(finetuning_args.use_swanlab, "swanlab")
+    require(finetuning_args.plot_loss, "matplotlib")
+
+    # requirements that depend on the training arguments
+    if training_args is None:
+        return
+
+    require(training_args.deepspeed, "deepspeed")
+    require(training_args.predict_with_generate, "jieba", "nltk", "rouge_chinese")
+
+
+def _parse_train_args(args: dict[str, Any] | list[str] | None = None) -> _TRAIN_CLS:
+    r"""Parse `args` into the training dataclasses; extra keys are allowed only if `ALLOW_EXTRA_ARGS` is enabled."""
+    parser, lenient = HfArgumentParser(_TRAIN_ARGS), is_env_enabled("ALLOW_EXTRA_ARGS")
+    return _parse_args(parser, args, allow_extra_keys=lenient)
+
+
+def _parse_train_mca_args(args: dict[str, Any] | list[str] | None = None) -> _TRAIN_MCA_CLS:
+    r"""Parse `args` with the MCA dataclasses, then patch the training and finetuning arguments for MCA."""
+    parser, lenient = HfArgumentParser(_TRAIN_MCA_ARGS), is_env_enabled("ALLOW_EXTRA_ARGS")
+    parsed = _parse_args(parser, args, allow_extra_keys=lenient)
+    model_args, data_args, training_args, finetuning_args, generating_args = parsed
+
+    # mutates `training_args` and `finetuning_args` in place
+    _configure_mca_training_args(training_args, data_args, finetuning_args)
+
+    return model_args, data_args, training_args, finetuning_args, generating_args
+
+
+def _configure_mca_training_args(training_args, data_args, finetuning_args) -> None:
+    """Force the generation-related training args to fixed values and flag both argument objects with `use_mca`."""
+    forced = {"predict_with_generate": False, "generation_max_length": data_args.cutoff_len, "generation_num_beams": 1}
+    for option, value in forced.items():
+        setattr(training_args, option, value)
+    for target in (training_args, finetuning_args):
+        target.use_mca = True
+
+
+def _parse_infer_args(args: dict[str, Any] | list[str] | None = None) -> _INFER_CLS:
+    r"""Parse `args` into the inference dataclasses; extra keys are allowed only if `ALLOW_EXTRA_ARGS` is enabled."""
+    parser, lenient = HfArgumentParser(_INFER_ARGS), is_env_enabled("ALLOW_EXTRA_ARGS")
+    return _parse_args(parser, args, allow_extra_keys=lenient)
+
+
+def _parse_eval_args(args: dict[str, Any] | list[str] | None = None) -> _EVAL_CLS:
+    r"""Parse `args` into the evaluation dataclasses; extra keys are allowed only if `ALLOW_EXTRA_ARGS` is enabled."""
+    parser, lenient = HfArgumentParser(_EVAL_ARGS), is_env_enabled("ALLOW_EXTRA_ARGS")
+    return _parse_args(parser, args, allow_extra_keys=lenient)
+
+
+def get_ray_args(args: dict[str, Any] | list[str] | None = None) -> RayArguments:
+    r"""Parse only the `RayArguments` out of `args`; extra keys are always allowed here."""
+    [ray_args] = _parse_args(HfArgumentParser(RayArguments), args, allow_extra_keys=True)
+    return ray_args
+
+
+def get_train_args(args: dict[str, Any] | list[str] | None = None) -> _TRAIN_CLS:
+    r"""Parse, validate and post-process all arguments of a training run, set the random seed and return them."""
+    if is_env_enabled("USE_MCA"):
+        model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_mca_args(args)
+    else:
+        model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
+        finetuning_args.use_mca = False
+
+    # Setup logging
+    if training_args.should_log:
+        _set_transformers_logging()
+
+    # Check arguments
+    if finetuning_args.stage != "sft":
+        if training_args.predict_with_generate:
+            raise ValueError("`predict_with_generate` cannot be set as True except SFT.")
+
+        if data_args.neat_packing:
+            raise ValueError("`neat_packing` cannot be set as True except SFT.")
+
+        if data_args.train_on_prompt or data_args.mask_history:
+            raise ValueError("`train_on_prompt` or `mask_history` cannot be set as True except SFT.")
+
+    if finetuning_args.stage == "sft" and training_args.do_predict and not training_args.predict_with_generate:
+        raise ValueError("Please enable `predict_with_generate` to save model predictions.")
+
+    if finetuning_args.stage in ["rm", "ppo"] and training_args.load_best_model_at_end:
+        raise ValueError("RM and PPO stages do not support `load_best_model_at_end`.")
+
+    if finetuning_args.stage == "ppo":
+        if not training_args.do_train:
+            raise ValueError("PPO training does not support evaluation, use the SFT stage to evaluate models.")
+
+        if model_args.shift_attn:
+            raise ValueError("PPO training is incompatible with S^2-Attn.")
+
+        if finetuning_args.reward_model_type == "lora" and model_args.use_kt:
+            raise ValueError("KTransformers does not support lora reward model.")
+
+        if finetuning_args.reward_model_type == "lora" and model_args.use_unsloth:
+            raise ValueError("Unsloth does not support lora reward model.")
+
+        if training_args.report_to and training_args.report_to[0] not in ["wandb", "tensorboard"]:
+            raise ValueError("PPO only accepts wandb or tensorboard logger.")
+
+    if not model_args.use_kt and training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED:
+        raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.")
+
+    if training_args.deepspeed and training_args.parallel_mode != ParallelMode.DISTRIBUTED:
+        raise ValueError("Please use `FORCE_TORCHRUN=1` to launch DeepSpeed training.")
+
+    if training_args.max_steps == -1 and data_args.streaming:
+        raise ValueError("Please specify `max_steps` in streaming mode.")
+
+    if training_args.do_train and data_args.dataset is None:
+        raise ValueError("Please specify dataset for training.")
+
+    needs_eval_data = training_args.do_eval or training_args.do_predict or training_args.predict_with_generate
+    if needs_eval_data and data_args.eval_dataset is None and data_args.val_size < 1e-6:
+        raise ValueError("Please make sure eval_dataset be provided or val_size >1e-6")
+
+    if training_args.predict_with_generate:
+        if is_deepspeed_zero3_enabled():
+            raise ValueError("`predict_with_generate` is incompatible with DeepSpeed ZeRO-3.")
+
+        if finetuning_args.compute_accuracy:
+            raise ValueError("Cannot use `predict_with_generate` and `compute_accuracy` together.")
+
+    if training_args.do_train and model_args.quantization_device_map == "auto":
+        raise ValueError("Cannot use device map for quantized models in training.")
+
+    if finetuning_args.pissa_init and is_deepspeed_zero3_enabled():
+        raise ValueError("Please use scripts/pissa_init.py to initialize PiSSA in DeepSpeed ZeRO-3.")
+
+    if finetuning_args.pure_bf16:
+        if not (is_torch_bf16_gpu_available() or (is_torch_npu_available() and torch.npu.is_bf16_supported())):
+            raise ValueError("This device does not support `pure_bf16`.")
+
+        if is_deepspeed_zero3_enabled():
+            raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.")
+
+    if training_args.parallel_mode == ParallelMode.DISTRIBUTED:
+        if finetuning_args.use_galore and finetuning_args.galore_layerwise:
+            raise ValueError("Distributed training does not support layer-wise GaLore.")
+
+        if finetuning_args.use_apollo and finetuning_args.apollo_layerwise:
+            raise ValueError("Distributed training does not support layer-wise APOLLO.")
+
+        if finetuning_args.use_badam:
+            if finetuning_args.badam_mode == "ratio":
+                raise ValueError("Ratio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
+            elif not is_deepspeed_zero3_enabled():
+                raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")
+
+    if training_args.deepspeed is not None and (finetuning_args.use_galore or finetuning_args.use_apollo):
+        raise ValueError("GaLore and APOLLO are incompatible with DeepSpeed yet.")
+
+    if not finetuning_args.use_mca and training_args.fp8 and model_args.quantization_bit is not None:
+        raise ValueError("FP8 training is not compatible with quantization. Please disable one of them.")
+
+    if model_args.infer_backend != EngineName.HF:
+        raise ValueError("vLLM/SGLang backend is only available for API, CLI and Web.")
+
+    if model_args.use_unsloth and is_deepspeed_zero3_enabled():
+        raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.")
+
+    if model_args.use_kt and is_deepspeed_zero3_enabled():
+        raise ValueError("KTransformers is incompatible with DeepSpeed ZeRO-3.")
+
+    if data_args.neat_packing and is_transformers_version_greater_than("4.53.0"):
+        raise ValueError("Neat packing is incompatible with transformers>=4.53.0.")
+
+    _set_env_vars()
+    _verify_model_args(model_args, data_args, finetuning_args)
+    _check_extra_dependencies(model_args, finetuning_args, training_args)
+
+    if not finetuning_args.use_mca and training_args.fp8_enable_fsdp_float8_all_gather and not training_args.fp8:
+        logger.warning_rank0("fp8_enable_fsdp_float8_all_gather requires fp8=True. Setting fp8=True.")
+        training_args.fp8 = model_args.fp8 = True  # set the checked flag itself; keep the legacy model_args attribute
+
+    if (
+        training_args.do_train
+        and finetuning_args.finetuning_type == "lora"
+        and model_args.quantization_bit is None
+        and model_args.resize_vocab
+        and finetuning_args.additional_target is None
+    ):
+        logger.warning_rank0(
+            "Remember to add embedding layers to `additional_target` to make the added tokens trainable."
+        )
+
+    if training_args.do_train and model_args.quantization_bit is not None and (not model_args.upcast_layernorm):
+        logger.warning_rank0("We recommend enable `upcast_layernorm` in quantized training.")
+
+    if training_args.do_train and (not training_args.fp16) and (not training_args.bf16):
+        logger.warning_rank0("We recommend enable mixed precision training.")
+
+    if (
+        training_args.do_train
+        and (finetuning_args.use_galore or finetuning_args.use_apollo)
+        and not finetuning_args.pure_bf16
+    ):
+        logger.warning_rank0(
+            "Using GaLore or APOLLO with mixed precision training may significantly increases GPU memory usage."
+        )
+
+    if (not training_args.do_train) and model_args.quantization_bit is not None:
+        logger.warning_rank0("Evaluating model in 4/8-bit mode may cause lower scores.")
+
+    if (not training_args.do_train) and finetuning_args.stage == "dpo" and finetuning_args.ref_model is None:
+        logger.warning_rank0("Specify `ref_model` for computing rewards at evaluation.")
+
+    # Post-process training arguments
+    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
+    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
+    training_args.remove_unused_columns = False  # important for multimodal dataset
+
+    if finetuning_args.finetuning_type == "lora":
+        # https://github.com/huggingface/transformers/blob/v4.50.0/src/transformers/trainer.py#L782
+        training_args.label_names = training_args.label_names or ["labels"]
+
+    if "swanlab" in training_args.report_to and finetuning_args.use_swanlab:
+        training_args.report_to.remove("swanlab")
+
+    if (
+        training_args.parallel_mode == ParallelMode.DISTRIBUTED
+        and training_args.ddp_find_unused_parameters is None
+        and finetuning_args.finetuning_type == "lora"
+    ):
+        logger.info_rank0("Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.")
+        training_args.ddp_find_unused_parameters = False
+
+    if finetuning_args.stage in ["rm", "ppo"] and finetuning_args.finetuning_type in ["full", "freeze"]:
+        can_resume_from_checkpoint = False
+        if training_args.resume_from_checkpoint is not None:
+            logger.warning_rank0("Cannot resume from checkpoint in current stage.")
+            training_args.resume_from_checkpoint = None
+    else:
+        can_resume_from_checkpoint = True
+
+    if (
+        training_args.resume_from_checkpoint is None
+        and training_args.do_train
+        and os.path.isdir(training_args.output_dir)
+        and not training_args.overwrite_output_dir
+        and can_resume_from_checkpoint
+    ):
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and any(
+            os.path.isfile(os.path.join(training_args.output_dir, name)) for name in CHECKPOINT_NAMES
+        ):
+            raise ValueError("Output directory already exists and is not empty. Please set `overwrite_output_dir`.")
+
+        if last_checkpoint is not None:
+            training_args.resume_from_checkpoint = last_checkpoint
+            logger.info_rank0(f"Resuming training from {training_args.resume_from_checkpoint}.")
+            logger.info_rank0("Change `output_dir` or use `overwrite_output_dir` to avoid.")
+
+    if (
+        finetuning_args.stage in ["rm", "ppo"]
+        and finetuning_args.finetuning_type == "lora"
+        and training_args.resume_from_checkpoint is not None
+    ):
+        logger.warning_rank0(
+            f"Add {training_args.resume_from_checkpoint} to `adapter_name_or_path` to resume training from checkpoint."
+        )
+
+    # Post-process model arguments
+    if training_args.bf16 or finetuning_args.pure_bf16:
+        model_args.compute_dtype = torch.bfloat16
+    elif training_args.fp16:
+        model_args.compute_dtype = torch.float16
+
+    model_args.device_map = {"": get_current_device()}
+    model_args.model_max_length = data_args.cutoff_len
+    model_args.block_diag_attn = data_args.neat_packing
+    data_args.packing = data_args.packing if data_args.packing is not None else finetuning_args.stage == "pt"
+
+    # Log on each process the small summary
+    logger.info(
+        f"Process rank: {training_args.process_index}, "
+        f"world size: {training_args.world_size}, device: {training_args.device}, "
+        f"distributed training: {training_args.parallel_mode == ParallelMode.DISTRIBUTED}, "
+        f"compute dtype: {str(model_args.compute_dtype)}"
+    )
+    transformers.set_seed(training_args.seed)
+
+    return model_args, data_args, training_args, finetuning_args, generating_args
+
+
+def get_infer_args(args: dict[str, Any] | list[str] | None = None) -> _INFER_CLS:
+    r"""Parse and validate the arguments for inference/export, then choose the model's device map."""
+    model_args, data_args, finetuning_args, generating_args = _parse_infer_args(args)
+
+    _set_transformers_logging()
+
+    # Check arguments (the backend is compared against `EngineName`, like the other backend checks in this module)
+    if model_args.infer_backend == EngineName.VLLM:
+        if finetuning_args.stage != "sft":
+            raise ValueError("vLLM engine only supports auto-regressive models.")
+
+        if model_args.quantization_bit is not None:
+            raise ValueError("vLLM engine does not support bnb quantization (GPTQ and AWQ are supported).")
+
+        if model_args.rope_scaling is not None:
+            raise ValueError("vLLM engine does not support RoPE scaling.")
+
+        if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
+            raise ValueError("vLLM only accepts a single adapter. Merge them first.")
+
+    _set_env_vars()
+    _verify_model_args(model_args, data_args, finetuning_args)
+    _check_extra_dependencies(model_args, finetuning_args)
+
+    # Post-process model arguments
+    if model_args.export_dir is not None and model_args.export_device == "cpu":
+        model_args.device_map = {"": torch.device("cpu")}
+        if data_args.cutoff_len != DataArguments().cutoff_len:  # override cutoff_len if it is not default
+            model_args.model_max_length = data_args.cutoff_len
+    else:
+        model_args.device_map = "auto"
+
+    return model_args, data_args, finetuning_args, generating_args
+
+
+def get_eval_args(args: dict[str, Any] | list[str] | None = None) -> _EVAL_CLS:
+    r"""Parse the evaluation arguments, verify them, set `device_map` to "auto" and set the random seed."""
+    parsed = _parse_eval_args(args)
+    model_args, data_args, eval_args, finetuning_args = parsed
+    _set_transformers_logging()
+
+    # any backend other than the HF engine is rejected here
+    if model_args.infer_backend != EngineName.HF:
+        raise ValueError("vLLM/SGLang backend is only available for API, CLI and Web.")
+
+    _set_env_vars()
+    _verify_model_args(model_args, data_args, finetuning_args)
+    _check_extra_dependencies(model_args, finetuning_args)
+
+    # post-process the model arguments and fix the seed
+    model_args.device_map = "auto"
+    transformers.set_seed(eval_args.seed)
+
+    return model_args, data_args, eval_args, finetuning_args
diff --git a/LlamaFactory/src/llamafactory/hparams/training_args.py b/LlamaFactory/src/llamafactory/hparams/training_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf975f0d29a8540b1c9126e9ce700c9b64a41dfd
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/hparams/training_args.py
@@ -0,0 +1,100 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from dataclasses import dataclass, field
+
+from transformers import Seq2SeqTrainingArguments
+from transformers.training_args import _convert_str_dict
+
+from ..extras.misc import is_env_enabled, use_ray
+from ..extras.packages import is_mcore_adapter_available
+
+
+# Select the base class of the training arguments once, at import time: the mcore_adapter
+# variant when the USE_MCA environment flag is enabled, the transformers one otherwise.
+if is_env_enabled("USE_MCA"):
+    if not is_mcore_adapter_available():
+        # fail at import time with an installation hint
+        raise ImportError(
+            "mcore_adapter is required when USE_MCA=1. Please install `mcore_adapter` and its dependencies."
+        )
+
+    # imported only on this branch, so mcore_adapter stays an optional dependency
+    from mcore_adapter import Seq2SeqTrainingArguments as McaSeq2SeqTrainingArguments
+
+    BaseTrainingArguments = McaSeq2SeqTrainingArguments
+else:
+    BaseTrainingArguments = Seq2SeqTrainingArguments
+
+
+@dataclass
+class RayArguments:
+    r"""Arguments pertaining to the Ray training."""
+
+    ray_num_workers: int = field(
+        default=1,
+        metadata={"help": "The number of workers for Ray training. Default is 1 worker."},
+    )
+    # accepted either as a dict or as a string; see __post_init__ for the string handling
+    ray_init_kwargs: dict | str | None = field(
+        default=None,
+        metadata={"help": "The arguments to pass to ray.init for Ray training. Default is None."},
+    )
+    master_addr: str | None = field(
+        default=None,
+        metadata={"help": "The master address for init_process_group"},
+    )
+    master_port: str | None = field(
+        default=None,
+        metadata={"help": "The master port for init_process_group"},
+    )
+
+    def __post_init__(self):
+        r"""Record whether Ray is in use and decode ``ray_init_kwargs`` passed as a JSON object string."""
+        # plain instance attribute (not a dataclass field) holding the result of use_ray()
+        self.use_ray = use_ray()
+
+        # only a string starting with "{" is parsed as JSON and then passed through
+        # transformers' `_convert_str_dict`; any other value is left untouched
+        if isinstance(self.ray_init_kwargs, str) and self.ray_init_kwargs.startswith("{"):
+            self.ray_init_kwargs = _convert_str_dict(json.loads(self.ray_init_kwargs))
+
+
+@dataclass
+class Fp8Arguments:
+    r"""Arguments pertaining to the FP8 training.
+
+    With the defaults below FP8 is disabled, the backend is ``"auto"`` and the FSDP
+    all-gather optimization is off, so every FP8 feature has to be enabled explicitly.
+    """
+
+    fp8: bool = field(
+        default=False,
+        metadata={
+            "help": "Enable FP8 mixed precision training via HuggingFace Accelerate. "
+            "Requires PyTorch 2.7+ and Hopper architecture GPUs."
+        },
+    )
+    # the help text lists the accepted names; this dataclass itself does not validate the value
+    fp8_backend: str = field(
+        default="auto",
+        metadata={
+            "help": "FP8 backend to use ('auto', 'torchao', 'te', 'msamp'). 'auto' selects best available backend."
+        },
+    )
+    fp8_enable_fsdp_float8_all_gather: bool = field(
+        default=False,
+        metadata={"help": "Enable FP8 optimizations for FSDP2 all-gather operations."},
+    )
+
+
+@dataclass
+class TrainingArguments(Fp8Arguments, RayArguments, BaseTrainingArguments):
+    r"""Arguments pertaining to the trainer.
+
+    Combines the FP8 and Ray argument groups with ``BaseTrainingArguments``, which this module
+    binds at import time to either the transformers or the mcore_adapter ``Seq2SeqTrainingArguments``.
+    """
+
+    # declared here only as a deprecated option, as stated by its help text
+    overwrite_output_dir: bool = field(
+        default=False,
+        metadata={"help": "deprecated"},
+    )
+
+    def __post_init__(self):
+        r"""Run the post-init hooks of the Ray arguments and of the base training arguments."""
+        # both hooks are called explicitly by class, Ray first, instead of going through super()
+        RayArguments.__post_init__(self)
+        BaseTrainingArguments.__post_init__(self)
diff --git a/LlamaFactory/src/llamafactory/launcher.py b/LlamaFactory/src/llamafactory/launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ccc06f4a9d2dbc818b4faa08160d2fe1b5698bc
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/launcher.py
@@ -0,0 +1,185 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import subprocess
+import sys
+from copy import deepcopy
+
+
+# Help text printed by `launch()` for the `help` command and after an unknown command.
+# The frame is 70 characters wide; rows are joined with newlines, without a trailing newline.
+USAGE = "\n".join(
+    [
+        "-" * 70,
+        "| Usage:                                                             |",
+        "|   llamafactory-cli api -h: launch an OpenAI-style API server       |",
+        "|   llamafactory-cli chat -h: launch a chat interface in CLI         |",
+        "|   llamafactory-cli export -h: merge LoRA adapters and export model |",
+        "|   llamafactory-cli train -h: train models                          |",
+        "|   llamafactory-cli webchat -h: launch a chat interface in Web UI   |",
+        "|   llamafactory-cli webui: launch LlamaBoard                        |",
+        "|   llamafactory-cli env: show environment info                      |",
+        "|   llamafactory-cli version: show version info                      |",
+        "| Hint: You can use `lmf` as a shortcut for `llamafactory-cli`.      |",
+        "-" * 70,
+    ]
+)
+
+
+def launch():
+    r"""Entry point of the command line interface: dispatch on the first CLI argument.
+
+    The sub-command is popped from ``sys.argv`` (``help`` when absent), so the selected
+    sub-command only sees the remaining arguments. ``train`` is re-launched through ``torchrun``
+    when ``FORCE_TORCHRUN`` is enabled (forced whenever ``USE_MCA`` is enabled) or when more than
+    one device is found and neither ``use_ray()`` nor ``use_kt()`` is true; in that case the
+    process exits with the return code of ``torchrun``.
+
+    Environment variables read for the torchrun launch: ``NNODES``, ``NODE_RANK``,
+    ``NPROC_PER_NODE``, ``MASTER_ADDR``, ``MASTER_PORT``, ``MAX_RESTARTS``, ``RDZV_ID``,
+    ``MIN_NNODES``, ``MAX_NNODES`` and ``OPTIM_TORCH``.
+
+    Raises:
+        NotImplementedError: For the ``eval`` command.
+        subprocess.CalledProcessError: If ``torchrun`` exits with a non-zero status.
+    """
+    from .extras import logging
+    from .extras.env import VERSION, print_env
+    from .extras.misc import find_available_port, get_device_count, is_env_enabled, use_kt, use_ray
+
+    logger = logging.get_logger(__name__)
+    # banner printed by the `version` command; the padding keeps the right border aligned
+    WELCOME = (
+        "-" * 58
+        + "\n"
+        + f"| Welcome to LLaMA Factory, version {VERSION}"
+        + " " * (21 - len(VERSION))
+        + "|\n|"
+        + " " * 56
+        + "|\n"
+        + "| Project page: https://github.com/hiyouga/LLaMA-Factory |\n"
+        + "-" * 58
+    )
+
+    command = sys.argv.pop(1) if len(sys.argv) > 1 else "help"
+    if is_env_enabled("USE_MCA"):  # force use torchrun
+        os.environ["FORCE_TORCHRUN"] = "1"
+
+    if command == "train" and (
+        is_env_enabled("FORCE_TORCHRUN") or (get_device_count() > 1 and not use_ray() and not use_kt())
+    ):
+        # launch distributed training
+        nnodes = os.getenv("NNODES", "1")
+        node_rank = os.getenv("NODE_RANK", "0")
+        nproc_per_node = os.getenv("NPROC_PER_NODE", str(get_device_count()))
+        master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
+        master_port = os.getenv("MASTER_PORT", str(find_available_port()))
+        logger.info_rank0(f"Initializing {nproc_per_node} distributed tasks at: {master_addr}:{master_port}")
+        if int(nnodes) > 1:
+            logger.info_rank0(f"Multi-node training enabled: num nodes: {nnodes}, node rank: {node_rank}")
+
+        # elastic launch support
+        max_restarts = os.getenv("MAX_RESTARTS", "0")
+        rdzv_id = os.getenv("RDZV_ID")
+        min_nnodes = os.getenv("MIN_NNODES")
+        max_nnodes = os.getenv("MAX_NNODES")
+
+        env = deepcopy(os.environ)
+        if is_env_enabled("OPTIM_TORCH", "1"):
+            # optimize DDP, see https://zhuanlan.zhihu.com/p/671834539
+            env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+            env["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+        # The argument vector is assembled as a list rather than formatted into one string and
+        # split on whitespace, so that a script path or user argument containing spaces reaches
+        # torchrun as a single token.
+        if rdzv_id is not None:
+            # launch elastic job with fault tolerant support when possible
+            # see also https://docs.pytorch.org/docs/stable/elastic/train_script.html
+            rdzv_nnodes = nnodes
+            # elastic number of nodes if MIN_NNODES and MAX_NNODES are set
+            if min_nnodes is not None and max_nnodes is not None:
+                rdzv_nnodes = f"{min_nnodes}:{max_nnodes}"
+
+            torchrun_options = ["--nnodes", rdzv_nnodes, "--nproc-per-node", nproc_per_node]
+            torchrun_options += ["--rdzv-id", rdzv_id, "--rdzv-backend", "c10d"]
+            torchrun_options += ["--rdzv-endpoint", f"{master_addr}:{master_port}"]
+            torchrun_options += ["--max-restarts", max_restarts]
+        else:
+            torchrun_options = ["--nnodes", nnodes, "--node_rank", node_rank]
+            torchrun_options += ["--nproc_per_node", nproc_per_node]
+            torchrun_options += ["--master_addr", master_addr, "--master_port", master_port]
+
+        # NOTE: DO NOT USE shell=True to avoid security risk
+        # torchrun runs this very file, whose `__main__` section starts the experiment
+        process = subprocess.run(
+            ["torchrun", *torchrun_options, __file__, *sys.argv[1:]],
+            env=env,
+            check=True,
+        )
+
+        sys.exit(process.returncode)
+
+    elif command == "api":
+        from .api.app import run_api
+
+        run_api()
+
+    elif command == "chat":
+        from .chat.chat_model import run_chat
+
+        run_chat()
+
+    elif command == "eval":
+        raise NotImplementedError("Evaluation will be deprecated in the future.")
+
+    elif command == "export":
+        from .train.tuner import export_model
+
+        export_model()
+
+    elif command == "train":
+        # single-process training: run the experiment in the current process
+        from .train.tuner import run_exp
+
+        run_exp()
+
+    elif command == "webchat":
+        from .webui.interface import run_web_demo
+
+        run_web_demo()
+
+    elif command == "webui":
+        from .webui.interface import run_web_ui
+
+        run_web_ui()
+
+    elif command == "env":
+        print_env()
+
+    elif command == "version":
+        print(WELCOME)
+
+    elif command == "help":
+        print(USAGE)
+
+    else:
+        print(f"Unknown command: {command}.\n{USAGE}")
+
+
+# Reached when this file is executed as a script: launch() passes `__file__` to torchrun,
+# so the processes started by torchrun begin the training experiment here.
+if __name__ == "__main__":
+    from llamafactory.train.tuner import run_exp  # use absolute import
+
+    run_exp()
diff --git a/LlamaFactory/src/llamafactory/model/__init__.py b/LlamaFactory/src/llamafactory/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d4f47f43273457d15e38e3454c5f4bc156da3d
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .loader import load_config, load_model, load_tokenizer
+from .model_utils.misc import find_all_linear_modules
+from .model_utils.quantization import QuantizationMethod
+from .model_utils.valuehead import load_valuehead_params
+
+
+__all__ = [
+    "QuantizationMethod",
+    "find_all_linear_modules",
+    "load_config",
+    "load_model",
+    "load_tokenizer",
+    "load_valuehead_params",
+]
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7c717323cb7583b48c6b019a72b3c51d18400e0
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..adabf8f7d1e223691f617b86543e501b947ee58c
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1671ebe86a71fc962a94523d4618cde5b4f53a7
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/adapter.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/adapter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36e27d0fd882e48f1a8f082ff283d3c1a90a99d9
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/adapter.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2a4437eef7aafb1cb712265abe61121946de429
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/loader.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/loader.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..281bc772f3d9b9a4a816713fb23f9725d482505b
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/loader.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2db86fd7df1d8db969c7047083bc352f4ab12e0
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/__pycache__/patcher.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/__pycache__/patcher.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0e9739ea578bbbe4753c8fe871423249ae8eb9a
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/__pycache__/patcher.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/adapter.py b/LlamaFactory/src/llamafactory/model/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..708db9429c9e95f056ed4491ef0ee381d70b7e58
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/adapter.py
@@ -0,0 +1,366 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import TYPE_CHECKING
+
+import torch
+from peft import LoraConfig, LoraModel, OFTConfig, PeftModel, TaskType, get_peft_model
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+from ..extras import logging
+from ..extras.constants import EngineName
+from .model_utils.ktransformers import get_kt_peft_model, load_kt_peft_model
+from .model_utils.misc import find_all_linear_modules, find_expanded_modules
+from .model_utils.quantization import QuantizationMethod
+from .model_utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model
+from .model_utils.visual import COMPOSITE_MODELS, get_forbidden_modules, patch_target_modules
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ..hparams import FinetuningArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+def _setup_full_tuning(
+    model: "PreTrainedModel",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
+    cast_trainable_params_to_fp32: bool,
+) -> None:
+    r"""Prepare the model for full-parameter tuning.
+
+    Does nothing unless ``is_trainable``. A parameter whose name contains one of the forbidden
+    module names gets its gradient disabled; every other parameter is upcast to float32 when
+    ``cast_trainable_params_to_fp32`` is set and is otherwise left as it is.
+    """
+    if is_trainable:
+        logger.info_rank0("Fine-tuning method: Full")
+        frozen_keys = get_forbidden_modules(model.config, finetuning_args)
+        for param_name, weight in model.named_parameters():
+            if any(frozen_key in param_name for frozen_key in frozen_keys):
+                weight.requires_grad_(False)
+            elif cast_trainable_params_to_fp32:
+                weight.data = weight.data.to(torch.float32)
+
+
+def _setup_freeze_tuning(
+    model: "PreTrainedModel",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
+    cast_trainable_params_to_fp32: bool,
+) -> None:
+    r"""Prepare the model for freeze (partial-parameter) tuning.
+
+    Does nothing unless ``is_trainable``. Otherwise a set of layer indices is derived from
+    ``finetuning_args.freeze_trainable_layers`` (the last layer of every block of ``stride``
+    layers with ``use_llama_pro``, the last n layers for a positive value, the first n layers
+    otherwise). The requested modules of those layers stay trainable, together with
+    ``freeze_extra_modules`` and, for composite models, the projector unless
+    ``freeze_multi_modal_projector`` is set. Gradients are disabled for every other parameter.
+
+    Raises:
+        ValueError: If the layer count cannot be read from the config, if
+            ``freeze_trainable_layers`` is not a positive divisor of the layer count while
+            ``use_llama_pro`` is enabled, or if a requested module name is not found in the model.
+    """
+    if not is_trainable:
+        return
+
+    logger.info_rank0("Fine-tuning method: Freeze")
+    if hasattr(model.config, "text_config"):  # composite models
+        config = getattr(model.config, "text_config")
+    else:
+        config = model.config
+
+    # different architectures expose the layer count under different config names
+    num_layers = (
+        getattr(config, "num_hidden_layers", None)
+        or getattr(config, "num_layers", None)
+        or getattr(config, "n_layer", None)
+    )
+    if not num_layers:
+        raise ValueError("Current model does not support freeze tuning.")
+
+    if finetuning_args.use_llama_pro:
+        # zero would divide by zero below and a negative value would silently select no layer
+        if finetuning_args.freeze_trainable_layers <= 0:
+            raise ValueError("`freeze_trainable_layers` should be positive when `use_llama_pro` is enabled.")
+
+        if num_layers % finetuning_args.freeze_trainable_layers != 0:
+            raise ValueError(
+                f"`num_layers` {num_layers} should be "
+                f"divisible by `freeze_trainable_layers` {finetuning_args.freeze_trainable_layers}."
+            )
+
+        stride = num_layers // finetuning_args.freeze_trainable_layers
+        trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
+    elif finetuning_args.freeze_trainable_layers > 0:  # fine-tuning the last n layers if freeze_trainable_layers > 0
+        trainable_layer_ids = range(max(0, num_layers - finetuning_args.freeze_trainable_layers), num_layers)
+    else:  # fine-tuning the first n layers if freeze_trainable_layers < 0
+        trainable_layer_ids = range(min(-finetuning_args.freeze_trainable_layers, num_layers))
+
+    # collect the module names found inside a layer (hidden) and outside any layer (non-hidden)
+    hidden_modules = set()
+    non_hidden_modules = set()
+    for name, _ in model.named_parameters():
+        if ".0." in name:
+            hidden_modules.add(name.split(".0.")[-1].split(".")[0])
+        elif ".1." in name:  # MoD starts from layer 1
+            hidden_modules.add(name.split(".1.")[-1].split(".")[0])
+
+        if re.search(r"\.\d+\.", name) is None:
+            non_hidden_modules.add(name.split(".")[-2])  # remove weight/bias
+
+    # name fragments that mark a parameter as trainable (matched by substring below)
+    trainable_layers = []
+    for module_name in finetuning_args.freeze_trainable_modules:
+        if module_name != "all" and module_name not in hidden_modules:
+            raise ValueError(
+                "Module {} is not found, please choose from {}".format(module_name, ", ".join(hidden_modules))
+            )
+
+        for idx in trainable_layer_ids:
+            # "all" yields ".<idx>.", which matches every module of that layer
+            trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else ""))
+
+    if finetuning_args.freeze_extra_modules:
+        for module_name in finetuning_args.freeze_extra_modules:
+            if module_name not in non_hidden_modules:
+                raise ValueError(
+                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(non_hidden_modules))
+                )
+
+            trainable_layers.append(module_name)
+
+    model_type = getattr(model.config, "model_type", None)
+    if not finetuning_args.freeze_multi_modal_projector and model_type in COMPOSITE_MODELS:
+        trainable_layers.append(COMPOSITE_MODELS[model_type].projector_key)
+
+    forbidden_modules = get_forbidden_modules(model.config, finetuning_args)
+    for name, param in model.named_parameters():
+        # trainable only if selected above and not part of a forbidden module
+        if any(trainable_layer in name for trainable_layer in trainable_layers) and not any(
+            forbidden_module in name for forbidden_module in forbidden_modules
+        ):
+            if cast_trainable_params_to_fp32:
+                param.data = param.data.to(torch.float32)
+        else:
+            param.requires_grad_(False)
+
+    logger.info_rank0("Set trainable layers: {}".format(",".join(trainable_layers)))
+
+
+def _setup_lora_tuning(
+    config: "PretrainedConfig",
+    model: "PreTrainedModel",
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
+    cast_trainable_params_to_fp32: bool,
+) -> "PeftModel":
+    r"""Attach LoRA / DoRA / OFT adapters to the model, merging or resuming existing ones.
+
+    When ``model_args.adapter_name_or_path`` is set, the listed adapters are merged into the model
+    one after another. The last one is instead loaded unmerged (to resume from it) when training
+    without ``create_new_adapter``, or when merging is disabled (quantized model, DeepSpeed ZeRO-3,
+    KTransformers or Unsloth). When training and no adapter is resumed, a new adapter is created on
+    the selected target modules. Finally, parameters that still require gradients are upcast to
+    float32 if ``is_trainable`` and ``cast_trainable_params_to_fp32`` are both set.
+
+    Returns:
+        The resulting model; it is returned unchanged when no adapter is loaded or created.
+
+    Raises:
+        ValueError: For the unsupported combinations checked below (KTransformers adapter with
+            another inference backend, DoRA on a non-BNB quantized model, OFT with KTransformers
+            or Unsloth).
+        AssertionError: If several adapters are given where only a single one is accepted.
+    """
+    if is_trainable:
+        if finetuning_args.finetuning_type == "oft":
+            logger.info_rank0("Fine-tuning method: OFT")
+        else:
+            logger.info_rank0("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
+
+    # adapter to load unmerged; stays None when every given adapter is merged
+    adapter_to_resume = None
+
+    if model_args.adapter_name_or_path is not None:
+        is_mergeable = True
+        if getattr(model, "quantization_method", None):  # merge lora in quantized model is unstable
+            assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter."
+            is_mergeable = False
+
+        if is_deepspeed_zero3_enabled():
+            assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3."
+            is_mergeable = False
+
+        if model_args.use_kt:
+            assert len(model_args.adapter_name_or_path) == 1, "KTransformers model only accepts a single adapter"
+            is_mergeable = False
+
+        if model_args.use_unsloth:
+            assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter."
+            is_mergeable = False
+
+        # keep the last adapter unmerged and merge everything listed before it
+        if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable):
+            adapter_to_merge = model_args.adapter_name_or_path[:-1]
+            adapter_to_resume = model_args.adapter_name_or_path[-1]
+        else:
+            adapter_to_merge = model_args.adapter_name_or_path
+
+        # keyword arguments shared by every PeftModel.from_pretrained call below
+        init_kwargs = {
+            "subfolder": model_args.adapter_folder,
+            "offload_folder": model_args.offload_folder,
+            "cache_dir": model_args.cache_dir,
+            "revision": model_args.model_revision,
+            "token": model_args.hf_hub_token,
+        }
+
+        if model_args.use_kt:
+            if model_args.infer_backend != EngineName.KT:
+                raise ValueError(
+                    "We should use ktransformers as backend to infer the adapter fine-tuned by ktransformers."
+                )
+
+        for adapter in adapter_to_merge:
+            model: LoraModel = PeftModel.from_pretrained(model, adapter, **init_kwargs)
+            model = model.merge_and_unload()
+
+        if len(adapter_to_merge) > 0:
+            logger.info_rank0(f"Merged {len(adapter_to_merge)} adapter(s).")
+
+        if adapter_to_resume is not None:  # resume lora training
+            if model_args.use_kt:
+                model = load_kt_peft_model(model_args, model)
+            elif model_args.use_unsloth:
+                # NOTE(review): unlike the other two loaders, this call is not given `model`
+                model = load_unsloth_peft_model(config, model_args, finetuning_args, is_trainable=is_trainable)
+            else:
+                model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs)
+
+        logger.info_rank0("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
+
+    if is_trainable and adapter_to_resume is None:  # create new lora weights while training
+        if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all":
+            target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision_tower)
+        else:
+            target_modules = finetuning_args.lora_target
+
+        if model_args.use_kt:
+            # expand the MLP projections to their `mlp.` and `shared_experts.` names and drop
+            # the three names excluded below
+            new_list = []
+            for m in target_modules:
+                if m in ("down_proj", "up_proj", "gate_proj"):
+                    new_list.extend([f"mlp.{m}", f"shared_experts.{m}"])
+                elif m not in ("generate_linear", "orig_module", "prefill_linear"):
+                    new_list.append(m)
+
+            # NOTE(review): slice assignment rewrites the list in place, so when `target_modules`
+            # is `finetuning_args.lora_target` itself, that argument is modified as well
+            target_modules[:] = new_list
+
+        if finetuning_args.use_llama_pro:
+            target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers)
+
+        target_modules = patch_target_modules(model, finetuning_args, target_modules)
+
+        if (
+            finetuning_args.use_dora
+            and getattr(model, "quantization_method", None) is not None
+            and getattr(model, "quantization_method", None) != QuantizationMethod.BNB
+        ):
+            raise ValueError("DoRA is not compatible with PTQ-quantized models.")
+
+        if model_args.resize_vocab and finetuning_args.additional_target is None:
+            # register the input/output embedding modules as `modules_to_save` (see peft_kwargs)
+            input_embeddings = model.get_input_embeddings()
+            output_embeddings = model.get_output_embeddings()
+            module_names = set()
+            for name, module in model.named_modules():
+                if module in [input_embeddings, output_embeddings]:
+                    module_names.add(name.split(".")[-1])
+
+            finetuning_args.additional_target = module_names
+            logger.warning_rank0("Vocab has been resized, add {} to trainable params.".format(",".join(module_names)))
+
+        # config keyword arguments; only the "lora" and "oft" types define `peft_kwargs`
+        if finetuning_args.finetuning_type == "lora":
+            peft_kwargs = {
+                "r": finetuning_args.lora_rank,
+                "target_modules": target_modules,
+                "lora_alpha": finetuning_args.lora_alpha,
+                "lora_dropout": finetuning_args.lora_dropout,
+                "use_rslora": finetuning_args.use_rslora,
+                "use_dora": finetuning_args.use_dora,
+                "modules_to_save": finetuning_args.additional_target,
+            }
+        elif finetuning_args.finetuning_type == "oft":
+            peft_kwargs = {
+                "r": finetuning_args.oft_rank,
+                "oft_block_size": finetuning_args.oft_block_size,
+                "target_modules": target_modules,
+                "module_dropout": finetuning_args.module_dropout,
+                "modules_to_save": finetuning_args.additional_target,
+            }
+
+        if model_args.use_kt:
+            if finetuning_args.finetuning_type == "oft":
+                raise ValueError("KTransformers is currently not supported for OFT.")
+            if finetuning_args.finetuning_type == "lora":
+                peft_config = LoraConfig(
+                    task_type=TaskType.CAUSAL_LM,
+                    inference_mode=False,
+                    **peft_kwargs,
+                )
+            else:
+                raise ValueError("KTransformers is currently only supported for LoRA.")
+
+            model = get_kt_peft_model(model, peft_config)
+            # NOTE(review): this goes to stdout via print, not through the module logger
+            print(f"KT_model:{model}")
+        elif model_args.use_unsloth:
+            if finetuning_args.finetuning_type == "oft":
+                raise ValueError("Unsloth is currently not supported for OFT.")
+
+            model = get_unsloth_peft_model(model, model_args, peft_kwargs)
+        else:
+            if finetuning_args.pissa_init:
+                # a `pissa_iter` of -1 selects plain "pissa"; any other value is put into the init name
+                if finetuning_args.pissa_iter == -1:
+                    logger.info_rank0("Using PiSSA initialization.")
+                    peft_kwargs["init_lora_weights"] = "pissa"
+                else:
+                    logger.info_rank0(f"Using PiSSA initialization with FSVD steps {finetuning_args.pissa_iter}.")
+                    peft_kwargs["init_lora_weights"] = f"pissa_niter_{finetuning_args.pissa_iter}"
+
+            if finetuning_args.finetuning_type == "lora":
+                peft_config = LoraConfig(
+                    task_type=TaskType.CAUSAL_LM,
+                    inference_mode=False,
+                    **peft_kwargs,
+                )
+            elif finetuning_args.finetuning_type == "oft":
+                peft_config = OFTConfig(
+                    task_type=TaskType.CAUSAL_LM,
+                    inference_mode=False,
+                    **peft_kwargs,
+                )
+            model = get_peft_model(model, peft_config)
+
+    # upcast every parameter that still requires gradients
+    if is_trainable and cast_trainable_params_to_fp32:
+        for param in filter(lambda p: p.requires_grad, model.parameters()):
+            param.data = param.data.to(torch.float32)
+
+    return model
+
+
+def init_adapter(
+    config: "PretrainedConfig",
+    model: "PreTrainedModel",
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
+) -> "PreTrainedModel":
+    r"""Set up the model for the method selected by ``finetuning_args.finetuning_type``.
+
+    Dispatches to the full, freeze or LoRA/OFT setup routine after deciding whether the
+    trainable parameters have to be upcast to float32.
+
+    Raises:
+        ValueError: When training a quantized model with a method other than LoRA/OFT or with
+            PiSSA initialization.
+        NotImplementedError: For an unknown fine-tuning type.
+    """
+    method = finetuning_args.finetuning_type
+
+    if is_trainable and getattr(model, "quantization_method", None) is not None:
+        if method not in ("lora", "oft"):
+            raise ValueError("Quantized models can only be used for the LoRA or OFT tuning.")
+
+        if finetuning_args.pissa_init:
+            raise ValueError("Cannot initialize PiSSA adapter on quantized models.")
+
+    # Trainable parameters are upcast to float32 only while training, and only when neither
+    # pure bf16 nor BAdam is used and the run is not an unquantized DeepSpeed ZeRO-3 one.
+    upcast_to_fp32 = False
+    if is_trainable:
+        if finetuning_args.pure_bf16 or finetuning_args.use_badam:
+            logger.info_rank0("Pure bf16 / BAdam detected, remaining trainable params in half precision.")
+        elif model_args.quantization_bit is None and is_deepspeed_zero3_enabled():
+            logger.info_rank0("DeepSpeed ZeRO3 detected, remaining trainable params in float32.")
+        else:
+            logger.info_rank0("Upcasting trainable params to float32.")
+            upcast_to_fp32 = True
+
+    if method == "full":
+        _setup_full_tuning(model, finetuning_args, is_trainable, upcast_to_fp32)
+        return model
+
+    if method == "freeze":
+        _setup_freeze_tuning(model, finetuning_args, is_trainable, upcast_to_fp32)
+        return model
+
+    if method in ("lora", "oft"):
+        return _setup_lora_tuning(config, model, model_args, finetuning_args, is_trainable, upcast_to_fp32)
+
+    raise NotImplementedError(f"Unknown finetuning type: {method}.")
diff --git a/LlamaFactory/src/llamafactory/model/loader.py b/LlamaFactory/src/llamafactory/model/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d838d217635faabade08db42f6dc2bd1c4a5a343
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/loader.py
@@ -0,0 +1,247 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import TYPE_CHECKING, Any, Optional, TypedDict
+
+import torch
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoModelForImageTextToText,
+    AutoModelForSeq2SeqLM,
+    AutoModelForTextToWaveform,
+    AutoProcessor,
+    AutoTokenizer,
+)
+from trl import AutoModelForCausalLMWithValueHead
+
+from ..extras import logging
+from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_other_hub
+from ..extras.packages import is_torch_version_greater_than
+from .adapter import init_adapter
+from .model_utils.ktransformers import load_kt_pretrained_model
+from .model_utils.liger_kernel import apply_liger_kernel
+from .model_utils.misc import register_autoclass
+from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
+from .model_utils.unsloth import load_unsloth_pretrained_model
+from .model_utils.valuehead import load_valuehead_params
+from .patcher import patch_config, patch_model, patch_processor, patch_tokenizer, patch_valuehead_model
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
+
+    from ..hparams import FinetuningArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class TokenizerModule(TypedDict):  # dict shape returned by load_tokenizer
+    tokenizer: "PreTrainedTokenizer"
+    processor: Optional["ProcessorMixin"]  # None when no usable processor was loaded
+
+
+def _get_init_kwargs(model_args: "ModelArguments") -> dict[str, Any]:
+    r"""Get arguments to load config/tokenizer/model.
+
+    Note: including inplace operation of model_args.
+    """
+    skip_check_imports()
+    model_args.model_name_or_path = try_download_model_from_other_hub(model_args)  # the in-place update noted above
+    return {  # forwarded as **kwargs to the from_pretrained calls in this module
+        "trust_remote_code": model_args.trust_remote_code,
+        "cache_dir": model_args.cache_dir,
+        "revision": model_args.model_revision,
+        "token": model_args.hf_hub_token,
+    }
+
+
+def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule":
+    r"""Load pretrained tokenizer and optionally loads processor.
+
+    Note: including inplace operation of model_args.
+    """
+    init_kwargs = _get_init_kwargs(model_args)  # also reassigns model_args.model_name_or_path
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=model_args.use_fast_tokenizer,
+            split_special_tokens=model_args.split_special_tokens,
+            padding_side="right",
+            **init_kwargs,
+        )
+    except ValueError:  # retry with the opposite use_fast setting (split_special_tokens is not passed here)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=not model_args.use_fast_tokenizer,
+            padding_side="right",
+            **init_kwargs,
+        )
+    except Exception as e:  # any other failure of the first attempt is fatal
+        raise OSError("Failed to load tokenizer.") from e
+
+    patch_tokenizer(tokenizer, model_args)
+
+    try:
+        processor = AutoProcessor.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=model_args.use_fast_tokenizer,
+            **init_kwargs,
+        )
+    except ValueError:  # retry with the opposite use_fast setting
+        processor = AutoProcessor.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=not model_args.use_fast_tokenizer,
+            **init_kwargs,
+        )
+    except Exception as e:  # best effort: a processor that cannot be loaded is logged, not raised
+        logger.info_rank0(f"Failed to load processor: {e}.")
+        processor = None
+
+    # Drop objects whose class name lacks "Processor" (avoids keeping a tokenizer as the processor), see:
+    # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/auto/processing_auto.py#L324
+    if processor is not None and "Processor" not in processor.__class__.__name__:
+        logger.debug("The loaded processor is not an instance of Processor. Dropping it.")
+        processor = None
+
+    if processor is not None:  # only a processor that survived the check above is patched
+        patch_processor(processor, tokenizer, model_args)
+
+    return {"tokenizer": tokenizer, "processor": processor}
+
+
+def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
+    r"""Load model config."""
+    init_kwargs = _get_init_kwargs(model_args)  # must run first: it reassigns model_args.model_name_or_path
+    return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
+
+
+def load_model(
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool = False,
+    add_valuehead: bool = False,
+) -> "PreTrainedModel":
+    r"""Load pretrained model."""
+    init_kwargs = _get_init_kwargs(model_args)
+    config = load_config(model_args)
+    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
+    apply_liger_kernel(config, model_args, is_trainable, require_logits=(finetuning_args.stage not in ["pt", "sft"]))
+
+    model = None
+    lazy_load = False  # when True, loading/patching below is skipped and model stays None until init_adapter
+    if model_args.use_kt:
+        from ktransformers.sft.monkey_patch_torch_module import install_patch
+
+        install_patch()
+        model = load_kt_pretrained_model(config, model_args)
+    elif model_args.use_unsloth:
+        if model_args.adapter_name_or_path is not None:
+            lazy_load = True
+        elif is_trainable:  # unsloth without adapters and not trainable falls through to the default path
+            model = load_unsloth_pretrained_model(config, model_args, finetuning_args)
+
+    if model is None and not lazy_load:  # nothing loaded yet: use the Auto classes or the MoD helper
+        init_kwargs["config"] = config
+        init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path
+        init_kwargs["torch_dtype"] = "auto"
+
+        if model_args.mixture_of_depths == "load":
+            model = load_mod_pretrained_model(**init_kwargs)
+        else:
+            if type(config) in AutoModelForImageTextToText._model_mapping.keys():  # image-text
+                load_class = AutoModelForImageTextToText
+            elif type(config) in AutoModelForSeq2SeqLM._model_mapping.keys():  # audio-text
+                load_class = AutoModelForSeq2SeqLM
+            elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio-text for qwen omni
+                load_class = AutoModelForTextToWaveform
+            else:
+                load_class = AutoModelForCausalLM
+
+            if model_args.train_from_scratch:  # build from the config instead of from_pretrained
+                model = load_class.from_config(config, trust_remote_code=model_args.trust_remote_code)
+            else:
+                model = load_class.from_pretrained(**init_kwargs)
+                if getattr(model.config, "model_type", None) in ["qwen2_5_omni", "qwen3_omni_moe"]:
+                    model = getattr(model, "thinker")  # continue with the thinker submodule only
+
+        if model_args.mixture_of_depths == "convert":
+            model = convert_pretrained_model_to_mod(model, config, model_args)
+
+    if not lazy_load:
+        patch_model(model, tokenizer, model_args, is_trainable, add_valuehead)
+        register_autoclass(config, model, tokenizer)
+
+    model = init_adapter(config, model, model_args, finetuning_args, is_trainable)
+
+    if add_valuehead:
+        model = AutoModelForCausalLMWithValueHead.from_pretrained(model)
+        patch_valuehead_model(model)
+
+        if model_args.adapter_name_or_path is not None:
+            vhead_path = model_args.adapter_name_or_path[-1]  # the last adapter path is used
+        else:
+            vhead_path = model_args.model_name_or_path
+
+        vhead_params = load_valuehead_params(vhead_path, model_args)
+        if vhead_params is not None:
+            model.load_state_dict(vhead_params, strict=False)  # strict=False tolerates keys that do not match
+            logger.info_rank0(f"Loaded valuehead from checkpoint: {vhead_path}")
+
+    # Conv3D models are rejected on torch 2.9.x (performance regression, see the issue linked in the error)
+    if is_torch_version_greater_than("2.9.0") and not is_torch_version_greater_than("2.10.0"):
+        if any(isinstance(m, torch.nn.Conv3d) for m in model.modules()):
+            raise ValueError(
+                "Unsupported torch version detected: torch 2.9.x with Conv3D. "
+                "This combination is known to cause severe performance regression. "
+                "Please downgrade torch to <2.9 or remove Conv3D. "
+                "See https://github.com/pytorch/pytorch/issues/166122"
+            )
+
+    if not is_trainable:  # inference: freeze every parameter and switch to eval mode
+        model.requires_grad_(False)
+        model.eval()
+    else:
+        model.train()
+
+    # Temporarily reuse the v1 kernel plugins to apply the NPU fusion operators in v0.
+    # Disabled by default; this block can be removed once the transition period ends.
+    if model_args.use_v1_kernels and is_trainable:
+        logger.warning_rank0(
+            "You are try to using future feature about kernels, please note that this feature "
+            "is not supported for all models. If get any error, please disable this feature, or report the issue."
+        )
+        from ..v1.plugins.model_plugins.kernels.interface import apply_default_kernels
+
+        model = apply_default_kernels(model, include_kernels=model_args.use_v1_kernels)
+
+    trainable_params, all_param = count_parameters(model)
+    if is_trainable:
+        param_stats = (
+            f"trainable params: {trainable_params:,} || "
+            f"all params: {all_param:,} || trainable%: {100 * trainable_params / all_param:.4f}"
+        )
+    else:
+        param_stats = f"all params: {all_param:,}"
+
+    logger.info_rank0(param_stats)
+
+    if model_args.print_param_status and int(os.getenv("LOCAL_RANK", "0")) == 0:  # local rank 0 only
+        for name, param in model.named_parameters():
+            print(f"name: {name}, dtype: {param.dtype}, device: {param.device}, trainable: {param.requires_grad}")
+
+    return model
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__init__.py b/LlamaFactory/src/llamafactory/model/model_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..729745da67e77ed8edf78a03d81b87f83edfff6f
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3bc5591d641ac6c2124fa59f860f416dfe8acfb
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8d01add5a78b6a3aaeb83034ec448942f26fa3a
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..351c6355480674a74b6f199851116e15d392426d
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c013f41e22098c1c92b2a9e66a5bea4f6d35d4a
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8fba5eb16f610e5db56ca399f1a1a4a2cbadec1
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00fe79e517aa77f4319c847c0975d8bd6e461436
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..818d762e54b6318e3a88259902bc9c8812e6bdc5
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/ktransformers.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/ktransformers.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..441437ad59a9a6c2f08357119f61929be9c027d9
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/ktransformers.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/ktransformers.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/ktransformers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..779a9025d4829556c2e72d10654f285d94480119
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/ktransformers.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4214881f03618e8315bdbd5ce623c6cd56bce887
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f10dfebf95b0350244cf88259b2fc2b59fb9081e
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4e1e321c4e71f0f5ef2ac5eebcf7b5268a80172
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6fc781e18a73ed484993742337b642ea141b8ba
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b47e9440aefa7f85545903b1af61b4c8f6bd4037
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e275187be107733d5d669df4bacf15e57b52231
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be20f37438c10ea32432626ba34502af98f6baf1
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e055d76b45c1f443f927ae2f138574d4f8cdc006
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..becd831fef933bd725830f21e5250ee1a7880cdf
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99ba47bff8715bbf5724fb61c35ccb9fa052d9ae
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1463fb684d5e3dfdb0646cd36a181767625e45ca
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d2c8be345ad62866ca72f690f8349df1b06f03b
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbc4a09ceceb476c917889f183ff77b855e66ff6
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..668a5578379a42e9e4416279bd6082d21666bc77
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7ebf09f1c07e49b0b6cd3698dee961f2ab29d13
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9ad6f3ea80318ececce96d33418c6e97e010708
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4af29188099ecb77cd123b16a9da0e6c7862d27
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..336f70af55dc3a75a5b7ff8fd6552a4bc98df5d6
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55c0a5c4a2351e57e01b4a0a2418208264e049c7
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d73ee9628e407df01f0373b969852b301f9c3d9
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a648b8a628a4a79c68be57d6bd88bd7b10f1511
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9d5ff6c289666bbecf0eff1c52d3b5e64c70a61
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74575377d3269dd6c58ec6fb36cce4f3d138fc7b
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-312.pyc b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87de450e329cc82201e5671f47185291cd5bd229
Binary files /dev/null and b/LlamaFactory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-312.pyc differ
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/attention.py b/LlamaFactory/src/llamafactory/model/model_utils/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..290df74a43d386736994a6744bfc521645f51d3a
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/attention.py
@@ -0,0 +1,115 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...extras import logging
+from ...extras.constants import AttentionFunction
+from ...extras.packages import is_torch_version_greater_than
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+def configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
+    from transformers.utils import is_flash_attn_2_available
+    # Purpose: turn model_args.flash_attn into an attention backend name and store it on config (both mutated).
+    if getattr(config, "model_type", None) == "gpt_oss":  # always uses the hub FA3 kernel, then returns
+        from transformers.integrations.hub_kernels import load_and_register_kernel
+
+        flash_attn3_kernel = "kernels-community/vllm-flash-attn3"
+        load_and_register_kernel(flash_attn3_kernel)
+        setattr(config, "_attn_implementation", flash_attn3_kernel)
+        setattr(config, "_attn_implementation_internal", flash_attn3_kernel)
+        model_args.flash_attn = AttentionFunction.FA3
+
+        logger.info_rank0("Using FlashAttention-3 with attention sink for the gpt-oss model.")
+        return
+
+    if getattr(config, "model_type", None) == "gemma2":
+        if model_args.flash_attn == AttentionFunction.AUTO or model_args.flash_attn == AttentionFunction.FA2:
+            if is_flash_attn_2_available():
+                if model_args.flash_attn != AttentionFunction.FA2:  # i.e. it was AUTO
+                    logger.warning_rank0("Gemma 2 should use flash attention 2, change `flash_attn` to fa2.")
+                    model_args.flash_attn = AttentionFunction.FA2
+            else:
+                logger.warning_rank0("FlashAttention-2 is not installed, use eager attention.")
+                model_args.flash_attn = AttentionFunction.DISABLED
+        elif model_args.flash_attn == AttentionFunction.SDPA:  # warn only; the setting is kept
+            logger.warning_rank0(
+                "Gemma-2 should use soft-capping attention, while the SDPA attention does not support it."
+            )
+
+    if getattr(config, "model_type", None) in ["youtu", "youtu_vl"]:
+        if model_args.flash_attn in (AttentionFunction.AUTO, AttentionFunction.SDPA):
+            logger.warning_rank0("Youtu-VL does not support SDPA, forcing eager attention.")
+            model_args.flash_attn = AttentionFunction.DISABLED
+
+    if model_args.flash_attn == AttentionFunction.AUTO:
+        return  # AUTO: config is left untouched
+
+    elif model_args.flash_attn == AttentionFunction.DISABLED:
+        requested_attn_implementation = "eager"
+
+    elif model_args.flash_attn == AttentionFunction.SDPA:
+        if not is_torch_version_greater_than("2.1.1"):
+            logger.warning_rank0("torch>=2.1.1 is required for SDPA attention.")
+            return  # config is left untouched
+
+        requested_attn_implementation = "sdpa"
+    elif model_args.flash_attn == AttentionFunction.FA2:
+        from transformers import is_torch_npu_available
+
+        if not (is_flash_attn_2_available() or is_torch_npu_available()):  # NPU passes without flash-attn
+            logger.warning_rank0("FlashAttention-2 is not installed.")
+            return  # config is left untouched
+
+        requested_attn_implementation = "flash_attention_2"
+    else:
+        raise NotImplementedError(f"Unknown attention type: {model_args.flash_attn}")
+
+    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
+        setattr(config, "attn_implementation", requested_attn_implementation)
+    elif getattr(config, "model_type", None) == "kimi_vl":  # set on both sub-configs, not on the top-level config
+        setattr(config.vision_config, "_attn_implementation", requested_attn_implementation)
+        setattr(config.text_config, "_attn_implementation", requested_attn_implementation)
+    elif getattr(config, "model_type", None) == "youtu_vl":  # set both attribute spellings plus any sub-configs
+        setattr(config, "attn_implementation", requested_attn_implementation)
+        setattr(config, "_attn_implementation", requested_attn_implementation)
+        if hasattr(config, "vision_config"):
+            setattr(config.vision_config, "_attn_implementation", requested_attn_implementation)
+        if hasattr(config, "text_config"):
+            setattr(config.text_config, "_attn_implementation", requested_attn_implementation)
+    else:
+        setattr(config, "_attn_implementation", requested_attn_implementation)
+
+
+def print_attn_implementation(config: "PretrainedConfig") -> None:  # log the attention backend stored on config
+    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
+        attn_implementation = getattr(config, "attn_implementation", None)
+    else:
+        attn_implementation = getattr(config, "_attn_implementation", None)
+
+    if attn_implementation == "flash_attention_2":
+        logger.info_rank0("Using FlashAttention-2 for faster training and inference.")
+    elif attn_implementation == "sdpa":
+        logger.info_rank0("Using torch SDPA for faster training and inference.")
+    else:  # any other value, including a missing attribute (None), is reported as vanilla
+        logger.info_rank0("Using vanilla attention implementation.")
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/checkpointing.py b/LlamaFactory/src/llamafactory/model/model_utils/checkpointing.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ba7ec96a74fe7955a4471abe219cd2ffca22f19
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/checkpointing.py
@@ -0,0 +1,184 @@
+# Copyright 2025 HuggingFace Inc., Daniel Han-Chen & the Unsloth team and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's Transformers and PEFT library,
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/modeling_utils.py
+# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/utils/other.py
+# and the Unsloth library.
+# https://github.com/unslothai/unsloth/blob/July-2024/unsloth/models/_utils.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+from collections.abc import Callable
+from functools import WRAPPER_ASSIGNMENTS, partial, wraps
+from types import MethodType
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import torch
+
+from ...extras import logging
+from ...extras.constants import LAYERNORM_NAMES
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+def get_unsloth_gradient_checkpointing_func() -> Callable:
+    r"""Return a checkpointing function (``apply`` of a custom autograd function) that keeps inputs on CPU."""
+    class UnslothGradientCheckpointing(torch.autograd.Function):
+        r"""Saves VRAM by smartly offloading to RAM."""
+
+        @staticmethod
+        @torch.cuda.amp.custom_fwd
+        def forward(
+            ctx: "torch.autograd.Function",
+            forward_function: "torch.nn.Module",
+            hidden_states: "torch.Tensor",
+            *args: Union["torch.Tensor", Any],
+        ) -> "torch.Tensor":
+            saved_hidden_states = hidden_states.to("cpu", non_blocking=True)
+            with torch.no_grad():  # no graph is recorded here; backward re-runs the forward function
+                outputs = forward_function(hidden_states, *args)
+            ctx.save_for_backward(saved_hidden_states)
+            ctx.device = hidden_states.device  # the offloaded copy must return to this device in backward
+            ctx.forward_function = forward_function
+            ctx.args = args
+            return outputs
+
+        @staticmethod
+        @torch.cuda.amp.custom_bwd
+        def backward(ctx: "torch.autograd.Function", grad_output: "torch.Tensor") -> "torch.Tensor":
+            (hidden_states,) = ctx.saved_tensors
+            hidden_states = hidden_states.to(ctx.device, non_blocking=True).detach()  # back on its own device
+            hidden_states.requires_grad_(True)
+            with torch.enable_grad():
+                outputs = ctx.forward_function(hidden_states, *ctx.args)
+                output = outputs[0] if isinstance(outputs, tuple) else outputs
+            torch.autograd.backward(output, grad_output)
+            return (None, hidden_states.grad) + (None,) * len(ctx.args)
+
+    return UnslothGradientCheckpointing.apply
+
+
+def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable) -> Callable:
+    r"""Wrap a checkpointing function so that it is only applied to layers with trainable parameters.
+
+    Layers whose parameters are all frozen are called directly. Otherwise the first floating-point
+    positional tensor is marked as requiring grad before the wrapped checkpointing function runs.
+    """
+
+    @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",))
+    def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tensor", Any], **kwargs):
+        # ``func`` is a bound method, possibly wrapped in a partial; the object it is bound to is the layer
+        bound_method = func.func if isinstance(func, partial) else func
+        module: torch.nn.Module = bound_method.__self__
+
+        if not any(param.requires_grad for param in module.parameters()):
+            return func(*args, **kwargs)
+
+        # assume the first floating-point tensor is always the hidden states
+        first_float = next((a for a in args if torch.is_tensor(a) and torch.is_floating_point(a)), None)
+        if first_float is not None:
+            first_float.requires_grad_(True)
+
+        return gradient_checkpointing_func(func, *args, **kwargs)
+
+    return custom_gradient_checkpointing_func
+
+
+def _gradient_checkpointing_enable(
+    self: "PreTrainedModel",
+    gradient_checkpointing_kwargs: Optional[dict[str, Any]] = None,
+    use_unsloth_gc: bool = False,
+) -> None:
+    r"""Turn on gradient checkpointing for this model.
+
+    Adapted from the original method so that gradient checkpointing works with block-wise optimizers.
+    ``gradient_checkpointing_kwargs`` are bound to ``torch.utils.checkpoint.checkpoint`` and default to
+    ``{"use_reentrant": True}``; they are not used when ``use_unsloth_gc`` selects the offloading variant.
+    Raises ``ValueError`` if the model does not declare support for gradient checkpointing.
+    """
+    from torch.utils.checkpoint import checkpoint
+
+    if not self.supports_gradient_checkpointing:
+        raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+
+    gc_kwargs = {"use_reentrant": True} if gradient_checkpointing_kwargs is None else gradient_checkpointing_kwargs
+    base_func = get_unsloth_gradient_checkpointing_func() if use_unsloth_gc else partial(checkpoint, **gc_kwargs)
+    wrapped_func = get_custom_gradient_checkpointing_func(base_func)
+
+    uses_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters  # old GC format
+    if not uses_old_format:  # have already enabled input require gradients
+        self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=wrapped_func)
+        return
+
+    self.apply(partial(self._set_gradient_checkpointing, value=True))
+    self.enable_input_require_grads()
+    logger.warning_rank0_once("You are using the old GC format, some features (e.g. BAdam) will be invalid.")
+
+
+def _fp32_forward_post_hook(
+    module: "torch.nn.Module", args: tuple["torch.Tensor"], output: "torch.Tensor"
+) -> "torch.Tensor":
+    return output.float()  # forward hook: hand back the module output cast to float32
+
+
+def prepare_model_for_training(model: "PreTrainedModel", model_args: "ModelArguments") -> None:
+    r"""Prepare the model before training.
+
+    Depending on ``model_args``, this will:
+    (1) cast the layernorm weights to fp32
+    (2) enable gradient checkpointing (non-reentrant under FSDP2) and switch off ``use_cache``
+    (3) add the upcasting of the lm_head outputs to fp32.
+    """
+    if model_args.upcast_layernorm:
+        logger.info_rank0("Upcasting layernorm weights in float32.")
+        for name, param in model.named_parameters():
+            if param.ndim != 1:
+                continue
+            if any(ln_name in name for ln_name in LAYERNORM_NAMES):
+                param.data = param.data.to(torch.float32)
+
+    using_fsdp = os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
+    if using_fsdp and int(os.environ.get("FSDP_VERSION", "1")) == 2:
+        model_args.use_reentrant_gc = False
+        logger.warning_rank0("You are using fsdp2, `use_reentrant_gc` has been set to False.")
+
+    gc_requested = not model_args.disable_gradient_checkpointing
+    if gc_requested and not getattr(model, "supports_gradient_checkpointing", False):
+        logger.warning_rank0("Current model does not support gradient checkpointing.")
+    elif gc_requested:
+        # use_reentrant=False might increase VRAM usage (have not been empirically verified yet)
+        # According to: https://github.com/huggingface/transformers/issues/28339
+        model.gradient_checkpointing_enable = MethodType(
+            partial(_gradient_checkpointing_enable, use_unsloth_gc=model_args.use_unsloth_gc), model
+        )
+        reentrant_kwargs = {"use_reentrant": model_args.use_reentrant_gc}
+        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=reentrant_kwargs)
+        model.config.use_cache = False  # turn off when gradient checkpointing is enabled
+        logger.info_rank0("Gradient checkpointing enabled.")
+
+    if not model_args.upcast_lmhead_output:
+        return
+
+    output_layer = model.get_output_embeddings()
+    if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32:
+        logger.info_rank0("Upcasting lm_head outputs in float32.")
+        output_layer.register_forward_hook(_fp32_forward_post_hook)
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/embedding.py b/LlamaFactory/src/llamafactory/model/model_utils/embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..b503f3b97d137a19ebe6338c8f4ec9d794e86efa
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/embedding.py
@@ -0,0 +1,218 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from contextlib import nullcontext
+from typing import TYPE_CHECKING, Optional
+
+import torch
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+from ...extras import logging
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, PreTrainedTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+
+def _noisy_mean_initialization(embed_weight: "torch.Tensor", num_new_tokens: int) -> None:
+    """Fill the trailing rows of ``embed_weight`` in place with the mean of the other rows plus noise.
+
+    This is the default initialization method used by LlamaFactory.
+
+    Args:
+        embed_weight: The embedding weight matrix to initialize (shape: [vocab_size, embedding_dim])
+        num_new_tokens: Number of new tokens added at the end of the embedding matrix
+    """
+    # The Gaussian noise has std 1/sqrt(embedding_dim).
+    noise_std = 1.0 / math.sqrt(embed_weight.size(1))
+    existing_mean = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
+    noise = torch.empty_like(embed_weight[-num_new_tokens:]).normal_(mean=0, std=noise_std)
+    embed_weight[-num_new_tokens:] = existing_mean + noise
+
+
+def _description_based_initialization(
+    embed_weight: "torch.Tensor",
+    num_new_tokens: int,
+    descriptions: dict[str, str],
+    tokenizer: "PreTrainedTokenizer",
+    model: "PreTrainedModel",
+    add_noise: bool = False,
+) -> None:
+    """Initialize new token embeddings based on textual descriptions.
+
+    For each description (in dict order; the i-th description fills the i-th new row), this function:
+    1. Tokenizes its description text
+    2. Looks up the rows of ``embed_weight`` for the description tokens that existed before the resize
+    3. Averages them to initialize the new token's embedding (mean of all old rows if none is usable)
+    4. Optionally adds Gaussian noise
+
+    Args:
+        embed_weight: The embedding weight matrix to initialize in place (shape: [vocab_size, embedding_dim])
+        num_new_tokens: Number of new rows at the end of ``embed_weight`` (may include padding rows)
+        descriptions: Dict mapping token string to its description text
+                      e.g., {"<think>": "A token representing reasoning process"}
+        tokenizer: The tokenizer instance
+        model: The model instance (unused; kept so existing callers keep working)
+        add_noise: Whether to add Gaussian noise to the initialization
+
+    Example:
+        descriptions = {
+            "<|START_OF_SVG|>": "Marks the beginning of an SVG document",
+            "<|END_OF_SVG|>": "Marks the end of an SVG document"
+        }
+    """
+    embedding_dim = embed_weight.size(1)
+    # Rows below this index existed before the resize. It is derived from the matrix itself because
+    # ``num_new_tokens`` can exceed the number of tokens added to the tokenizer when rows are padded.
+    first_new_row = embed_weight.size(0) - num_new_tokens
+
+    for i, desc in enumerate(descriptions.values()):
+        if i >= num_new_tokens:
+            # A negative index would wrap around here and overwrite an existing token's embedding.
+            logger.warning_rank0(f"Got more descriptions than new rows ({num_new_tokens}); ignoring the rest.")
+            break
+
+        # Tokenize description text
+        tokens = tokenizer(desc, return_tensors="pt", add_special_tokens=False)
+        with torch.no_grad():
+            token_ids = tokens["input_ids"][0].to(embed_weight.device)
+            valid_token_ids = token_ids[token_ids < first_new_row]  # new rows hold no meaningful values yet
+
+            if len(valid_token_ids) == 0:
+                logger.warning_rank0(
+                    f"Description for token {i + 1}/{num_new_tokens} contains no valid tokens. "
+                    "Using mean of existing embeddings."
+                )
+                base_embedding = embed_weight[:first_new_row].mean(dim=0)
+            else:
+                # Average rows of the matrix being initialized, so an untied output matrix is built from
+                # its own rows and everything stays on ``embed_weight``'s device.
+                base_embedding = embed_weight[valid_token_ids].mean(dim=0)
+
+            if add_noise:
+                noise = torch.randn_like(base_embedding) * (1.0 / math.sqrt(embedding_dim))
+                base_embedding = base_embedding + noise
+
+            embed_weight[first_new_row + i] = base_embedding
+
+
+def _initialize_embeddings(
+    embed_weight: "torch.Tensor",
+    num_new_tokens: int,
+    init_method: str,
+    new_special_tokens_config: Optional[dict],
+    tokenizer: "PreTrainedTokenizer",
+    model: "PreTrainedModel",
+) -> None:
+    """Dispatch to the embedding initializer selected by ``init_method``.
+
+    Description-based methods need a non-empty ``new_special_tokens_config``; otherwise 'noise_init' is used.
+
+    Args:
+        embed_weight: The embedding weight matrix to initialize
+        num_new_tokens: Number of new tokens added
+        init_method: Initialization method ('noise_init', 'desc_init', 'desc_init_w_noise')
+        new_special_tokens_config: Config dict with token descriptions (required for desc_init methods)
+        tokenizer: The tokenizer instance
+        model: The model instance
+    """
+    if new_special_tokens_config and init_method in ("desc_init", "desc_init_w_noise"):
+        with_noise = init_method == "desc_init_w_noise"
+        if with_noise:
+            logger.info_rank0("Using semantic initialization with noise (desc_init_w_noise) for new special tokens")
+        else:
+            logger.info_rank0("Using semantic initialization (desc_init) for new special tokens")
+        _description_based_initialization(
+            embed_weight, num_new_tokens, new_special_tokens_config, tokenizer, model, add_noise=with_noise
+        )
+        return
+
+    if init_method != "noise_init":
+        logger.warning_rank0(
+            f"init_method='{init_method}' requires descriptions config, falling back to 'noise_init'"
+        )
+    logger.info_rank0("Using noisy mean initialization (noise_init) for new special tokens")
+    _noisy_mean_initialization(embed_weight, num_new_tokens)
+
+
+def resize_embedding_layer(
+    model: "PreTrainedModel",
+    tokenizer: "PreTrainedTokenizer",
+    new_special_tokens_config: Optional[dict] = None,
+    init_special_tokens: str = "noise_init",
+) -> None:
+    r"""Grow the embedding matrices to fit the tokenizer and initialize the added rows.
+
+    Nothing happens when the tokenizer is not larger than the current input embedding matrix.
+    Otherwise the embeddings are resized (padded to a multiple of 64), the new rows of the input
+    embeddings (and of the output embeddings when they are not tied) are initialized, and
+    ``model.config.vocab_size`` is updated. Under DeepSpeed ZeRO-3 the sizes are read and the rows
+    are written inside ``GatheredParameters``.
+
+    Args:
+        model: The model to resize
+        tokenizer: The tokenizer (used to get target vocab size)
+        new_special_tokens_config: Optional dict with token descriptions for semantic initialization
+        init_special_tokens: Initialization method ('noise_init', 'desc_init', 'desc_init_w_noise')
+
+    Raises:
+        ValueError: If the model is quantized or its output embedding layer is not ``torch.nn.Linear``.
+    """
+    if is_deepspeed_zero3_enabled():
+        import deepspeed  # type: ignore
+
+        gathered = [model.get_input_embeddings().weight]
+        if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
+            gathered.append(model.get_output_embeddings().weight)
+
+        context_maybe_zero3 = deepspeed.zero.GatheredParameters(gathered, modifier_rank=0)
+    else:
+        context_maybe_zero3 = nullcontext()
+
+    with context_maybe_zero3:
+        current_embedding_size = model.get_input_embeddings().weight.size(0)
+
+    if len(tokenizer) <= current_embedding_size:
+        return
+
+    if getattr(model, "quantization_method", None):
+        raise ValueError("Cannot resize embedding layers of a quantized model.")
+
+    if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
+        raise ValueError("Current model does not support resizing embedding layers.")
+
+    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
+    with context_maybe_zero3:
+        new_embedding_size = model.get_input_embeddings().weight.size(0)
+        num_new_tokens = new_embedding_size - current_embedding_size
+        logger.info_rank0(
+            f"Resizing embeddings: {current_embedding_size} -> {new_embedding_size} (+{num_new_tokens} tokens)"
+        )
+
+        # Input embeddings first, then the output embeddings if they are not tied
+        layers_to_init = [model.get_input_embeddings()]
+        if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
+            layers_to_init.append(model.get_output_embeddings())
+
+        for layer in layers_to_init:
+            _initialize_embeddings(
+                layer.weight.data, num_new_tokens, init_special_tokens, new_special_tokens_config, tokenizer, model
+            )
+
+    model.config.vocab_size = new_embedding_size
+    logger.info_rank0(f"Resized token embeddings from {current_embedding_size} to {new_embedding_size}.")
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/ktransformers.py b/LlamaFactory/src/llamafactory/model/model_utils/ktransformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..26c413cabe0333185714126e45a2edc732b37720
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/ktransformers.py
@@ -0,0 +1,154 @@
+# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.util as _u
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from ...extras import logging
+from ...extras.misc import get_current_device
+
+
+if TYPE_CHECKING:
+    from ...hparams import FinetuningArguments, ModelArguments
+
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
+
+
+KT_AVAILABLE = _u.find_spec("ktransformers") is not None
+if KT_AVAILABLE:
+    from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
+    from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
+    from ktransformers.models.modeling_llama import LlamaForCausalLM
+    from ktransformers.models.modeling_mixtral import MixtralForCausalLM
+    from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
+    from ktransformers.models.modeling_qwen3_moe import Qwen3MoeForCausalLM
+    from ktransformers.optimize.optimize import optimize_and_load_gguf
+    from ktransformers.server.config.config import Config
+    from ktransformers.sft.lora import inject_lora_layer
+    from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
+    from ktransformers.util.globals import GLOBAL_CONFIG
+    from ktransformers.util.utils import load_weights
+
+logger = logging.get_logger(__name__)
+
+
+def _get_kt_kwargs(
+    config: "PretrainedConfig",
+    model_name_or_path: str,
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+) -> dict[str, Any]:
+    r"""Collect the keyword arguments for loading a model through KTransformers."""
+    return dict(
+        model_name=model_name_or_path,
+        max_seq_length=model_args.model_max_length or 4096,  # 4096 when no max length is configured
+        dtype=model_args.compute_dtype,
+        load_in_4bit=(model_args.quantization_bit == 4),
+        token=model_args.hf_hub_token,
+        full_finetuning=(finetuning_args.finetuning_type == "full"),
+        device_map={"": get_current_device()},
+        rope_scaling=getattr(config, "rope_scaling", None),
+        fix_tokenizer=False, trust_remote_code=model_args.trust_remote_code,
+        use_gradient_checkpointing="ktransformers",
+    )
+
+
+def load_kt_pretrained_model(config: "PretrainedConfig", model_args: "ModelArguments") -> "PreTrainedModel":
+    r"""Optionally load pretrained model with KTransformers. Used in training.
+
+    Note: ``config`` is re-read from ``model_args.model_name_or_path`` and the torch default dtype is changed.
+    """
+    custom_models = {
+        "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
+        "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
+        "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
+        "Qwen3MoeForCausalLM": Qwen3MoeForCausalLM,
+        "LlamaForCausalLM": LlamaForCausalLM,
+        "MixtralForCausalLM": MixtralForCausalLM,
+    }
+    Config().cpu_infer = model_args.cpu_infer
+    Config().chunk_size = model_args.chunk_size
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code)
+
+    if model_args.mode == "long_context":
+        assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
+        torch.set_default_dtype(torch.float16)
+    else:
+        torch.set_default_dtype(config.torch_dtype)
+
+    with torch.device("meta"):
+        if config.architectures[0] in custom_models:
+            print("using custom modeling_xxx.py.")
+            if "Qwen2Moe" in config.architectures[0]:  # Qwen2Moe must use flash_attention_2 to avoid overflow.
+                config._attn_implementation = "flash_attention_2"
+            if "Llama" in config.architectures[0]:
+                config._attn_implementation = "eager"
+            if "Mixtral" in config.architectures[0]:
+                config._attn_implementation = "flash_attention_2"
+            model = custom_models[config.architectures[0]](config)
+        else:  # built from the config; remote code runs only if the user enabled trust_remote_code
+            model = AutoModelForCausalLM.from_config(
+                config, trust_remote_code=model_args.trust_remote_code, attn_implementation="flash_attention_2"
+            )
+
+    optimize_config_path = model_args.kt_optimize_rule
+    gguf_path = model_args.model_name_or_path
+    assert optimize_config_path is not None, "optimize_config_path must be provided (path to YAML rules file)."
+    assert gguf_path is not None, "gguf_path must be provided (path to a folder or .gguf file)."
+
+    GLOBAL_CONFIG._config["mod"] = "infer"
+    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
+    return model
+
+
+def get_kt_peft_model(model: "PreTrainedModel", peft_kwargs: dict[str, Any]) -> "PreTrainedModel":
+    r"""Build the KTransformers peft model for ``model`` from ``peft_kwargs``. Used in training."""
+    from ktransformers.sft.peft_utils.mapping import get_peft_model as kt_get_peft_model
+    peft_model = kt_get_peft_model(model, peft_kwargs)
+    return peft_model
+
+
+def load_kt_peft_model(model_args: "ModelArguments", model: "PreTrainedModel") -> "PreTrainedModel":
+    r"""Load peft model with KTransformers. Used in both training and inference.
+
+    Weights come from the first adapter path: a ``.gguf`` file, or otherwise read with ``SafeTensorLoader``.
+    """
+    load_adapter_name_or_path = model_args.adapter_name_or_path[0]
+    inject_lora_layer(model, load_adapter_name_or_path)  # done for both adapter formats
+    if load_adapter_name_or_path.endswith(".gguf"):
+        adapter_gguf_loader = GGUFLoader(load_adapter_name_or_path)
+        load_weights(model, adapter_gguf_loader, adapter_gguf=True)
+        model.train()
+        return model
+
+    adapter_loader = SafeTensorLoader(load_adapter_name_or_path)
+    device = next(model.parameters()).device
+    for key in adapter_loader.tensor_file_map.keys():
+        # computed outside the try so the except handlers never see an unbound or stale model_key
+        model_key = key.replace("base_model.model.", "")
+        model_key = model_key.replace(".weight", ".default.weight")
+        model_key = model_key.replace(".default.default.weight", ".default.weight")
+        try:
+            tensor = adapter_loader.load_tensor(key, device=device)
+            param = model.get_parameter(model_key)
+            param.data.copy_(tensor.data)
+            print(f"Loaded adapter weight: {key} -> {model_key}")
+        except AttributeError:
+            print(f"Skipping {key}: not a model parameter")
+        except KeyError:
+            print(f"Key not found in model: {model_key} (original: {key})")
+
+    return model
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/kv_cache.py b/LlamaFactory/src/llamafactory/model/model_utils/kv_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f622f73f30017868f55e8503a968593c309b204
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/kv_cache.py
@@ -0,0 +1,44 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...extras import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
+
+def configure_kv_cache(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
+    r"""Set ``use_cache`` on the config (and on its ``text_config``, if any) and log the result.
+
+    The cache is always turned off for training; otherwise ``model_args.use_kv_cache`` decides.
+    """
+    use_cache = False if is_trainable else model_args.use_kv_cache
+    config.use_cache = use_cache
+    if hasattr(config, "text_config"):
+        config.text_config.use_cache = use_cache
+
+    if is_trainable:
+        logger.info_rank0("KV cache is disabled during training.")
+    elif model_args.use_kv_cache:
+        logger.info_rank0("KV cache is enabled for faster generation.")
+    else:
+        logger.info_rank0("KV cache is disabled.")
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/liger_kernel.py b/LlamaFactory/src/llamafactory/model/model_utils/liger_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8f0e842e8b9f2a833f525ce1cdc155d2fbc6bc0
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/liger_kernel.py
@@ -0,0 +1,97 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import TYPE_CHECKING
+
+from ...extras import logging
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+def apply_liger_kernel(
+    config: "PretrainedConfig",
+    model_args: "ModelArguments",
+    is_trainable: bool,
+    require_logits: bool,
+) -> None:
+    r"""Apply the liger kernel patch that matches ``config.model_type``; a no-op unless training with it enabled."""
+    if not (is_trainable and model_args.enable_liger_kernel):
+        return
+
+    model_type = getattr(config, "model_type", None)
+    if model_type == "gemma":
+        from liger_kernel.transformers import apply_liger_kernel_to_gemma as patch_fn
+    elif model_type == "gemma2":
+        from liger_kernel.transformers import apply_liger_kernel_to_gemma2 as patch_fn
+    elif model_type == "gemma3":
+        from liger_kernel.transformers import apply_liger_kernel_to_gemma3 as patch_fn
+    elif model_type == "gemma3_text":
+        from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text as patch_fn
+    elif model_type == "glm4":
+        from liger_kernel.transformers import apply_liger_kernel_to_glm4 as patch_fn
+    elif model_type == "glm4v":
+        from liger_kernel.transformers import apply_liger_kernel_to_glm4v as patch_fn
+    elif model_type == "granite":
+        from liger_kernel.transformers import apply_liger_kernel_to_granite as patch_fn
+    elif model_type == "llama":
+        from liger_kernel.transformers import apply_liger_kernel_to_llama as patch_fn
+    elif model_type == "llava":
+        from liger_kernel.transformers import apply_liger_kernel_to_llava as patch_fn
+    elif model_type == "mistral":
+        from liger_kernel.transformers import apply_liger_kernel_to_mistral as patch_fn
+    elif model_type == "mixtral":
+        from liger_kernel.transformers import apply_liger_kernel_to_mixtral as patch_fn
+    elif model_type == "mllama":
+        from liger_kernel.transformers import apply_liger_kernel_to_mllama as patch_fn
+    elif model_type == "olmo2":
+        from liger_kernel.transformers import apply_liger_kernel_to_olmo2 as patch_fn
+    elif model_type == "paligemma":
+        from liger_kernel.transformers import apply_liger_kernel_to_paligemma as patch_fn
+    elif model_type == "phi3":
+        from liger_kernel.transformers import apply_liger_kernel_to_phi3 as patch_fn
+    elif model_type == "qwen2":
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen2 as patch_fn
+    elif model_type == "qwen2_vl":
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl as patch_fn
+    elif model_type == "qwen2_5_vl":
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl as patch_fn
+    elif model_type == "qwen3":
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen3 as patch_fn
+    elif model_type == "qwen3_moe":
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe as patch_fn
+    elif model_type == "gpt_oss":
+        try:
+            from liger_kernel.transformers import apply_liger_kernel_to_gpt_oss as patch_fn
+        except ImportError:
+            logger.warning_rank0("Please install liger-kernel from https://github.com/Comet0322/Liger-Kernel.")
+            return
+    else:
+        logger.warning_rank0("Current model does not support liger kernel.")
+        return
+
+    patch_kwargs = {}
+    if require_logits and "fused_linear_cross_entropy" in inspect.signature(patch_fn).parameters:
+        logger.info_rank0("Current training stage does not support chunked cross entropy.")
+        patch_kwargs = {"fused_linear_cross_entropy": False, "cross_entropy": True}
+
+    patch_fn(**patch_kwargs)
+    logger.info_rank0("Liger kernel has been applied to the model.")
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/longlora.py b/LlamaFactory/src/llamafactory/model/model_utils/longlora.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7c36ee0cb89959fa692d57cc9f85bb162ca58a0
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/longlora.py
@@ -0,0 +1,370 @@
+# Copyright 2025 EleutherAI, HuggingFace Inc., Yukang Chen, and the LlamaFactory team.
+#
+# This code is based on the EleutherAI's GPT-NeoX and the HuggingFace's Transformers libraries.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
+# This code is also inspired by the original LongLoRA implementation.
+# https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import TYPE_CHECKING, Optional
+
+import torch
+import torch.nn as nn
+import transformers
+
+from ...extras import logging
+from ...extras.constants import SUPPORTED_CLASS_FOR_S2ATTN
+from ...extras.misc import check_version
+from ...extras.packages import is_transformers_version_greater_than
+
+
+if not is_transformers_version_greater_than("4.48.0"):
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
+    from transformers.models.llama.modeling_llama import (
+        Cache,
+        LlamaAttention,
+        LlamaFlashAttention2,
+        LlamaSdpaAttention,
+        apply_rotary_pos_emb,
+        repeat_kv,
+    )
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
+
+# Logger obtained from transformers' logging utilities; the attention forwards in this
+# module emit their one-time warnings through its `warning_once` method.
+transformers_logger = transformers.utils.logging.get_logger(__name__)
+
+
+# Modified from:
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
+def llama_attention_forward(
+    self: "LlamaAttention",
+    hidden_states: "torch.Tensor",
+    attention_mask: Optional["torch.Tensor"] = None,
+    position_ids: Optional["torch.LongTensor"] = None,
+    past_key_value: Optional["Cache"] = None,
+    output_attentions: bool = False,
+    cache_position: Optional["torch.LongTensor"] = None,
+    position_embeddings: Optional[tuple["torch.Tensor", "torch.Tensor"]] = None,
+    **kwargs,
+) -> tuple["torch.Tensor", Optional["torch.Tensor"], Optional[tuple["torch.Tensor"]]]:
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states: torch.Tensor = self.q_proj(hidden_states)
+    key_states: torch.Tensor = self.k_proj(hidden_states)
+    value_states: torch.Tensor = self.v_proj(hidden_states)
+
+    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+    if position_embeddings is None:
+        cos, sin = self.rotary_emb(value_states, position_ids)
+    else:
+        cos, sin = position_embeddings
+
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+    if past_key_value is not None:
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
+        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
+        assert q_len % groupsz == 0, f"q_len {q_len} should be divisible by group size {groupsz}."
+        num_groups = q_len // groupsz
+
+        def shift(state: "torch.Tensor") -> "torch.Tensor":
+            state = state.transpose(1, 2)  # output: (bsz, seq_len, n_heads, head_dim)
+            state = torch.cat(
+                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
+                dim=2,
+            )
+            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2)
+
+        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)
+
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+    attn_output = torch.matmul(attn_weights, value_states)  # (bsz, :, seq_len, :) or (bsz * n_group, :, groupsz, :)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
+        attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
+        attn_output = torch.cat(
+            (
+                attn_output[:, :, : self.num_heads // 2],
+                attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
+            ),
+            dim=2,
+        )
+
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
+# Modified from:
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
+def llama_flash_attention_2_forward(
+    self: "LlamaFlashAttention2",
+    hidden_states: "torch.Tensor",
+    attention_mask: Optional["torch.Tensor"] = None,
+    position_ids: Optional["torch.LongTensor"] = None,
+    past_key_value: Optional["Cache"] = None,
+    output_attentions: bool = False,
+    cache_position: Optional["torch.LongTensor"] = None,
+    position_embeddings: Optional[tuple["torch.Tensor", "torch.Tensor"]] = None,
+    **kwargs,
+) -> tuple["torch.Tensor", Optional["torch.Tensor"], Optional[tuple["torch.Tensor"]]]:
+    # LlamaFlashAttention2 attention does not support output_attentions
+    output_attentions = False
+
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states: torch.Tensor = self.q_proj(hidden_states)
+    key_states: torch.Tensor = self.k_proj(hidden_states)
+    value_states: torch.Tensor = self.v_proj(hidden_states)
+
+    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+    if position_embeddings is None:
+        cos, sin = self.rotary_emb(value_states, position_ids)
+    else:
+        cos, sin = position_embeddings
+
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+    if past_key_value is not None:
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim)
+    query_states = query_states.transpose(1, 2)
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
+
+    dropout_rate = self.attention_dropout if self.training else 0.0
+
+    input_dtype = query_states.dtype
+    if input_dtype == torch.float32:
+        if torch.is_autocast_enabled():
+            target_dtype = torch.get_autocast_gpu_dtype()
+        elif hasattr(self.config, "_pre_quantization_dtype"):
+            target_dtype = self.config._pre_quantization_dtype
+        else:
+            target_dtype = self.q_proj.weight.dtype
+
+        transformers_logger.warning_once("The input hidden states seems to be silently casted in float32.")
+        query_states = query_states.to(target_dtype)
+        key_states = key_states.to(target_dtype)
+        value_states = value_states.to(target_dtype)
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
+        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
+        assert q_len % groupsz == 0, f"q_len {q_len} should be divisible by group size {groupsz}."
+        num_groups = q_len // groupsz
+
+        def shift(state: "torch.Tensor") -> "torch.Tensor":
+            state = torch.cat(
+                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
+                dim=2,
+            )
+            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim)
+
+        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, :groupsz].repeat(num_groups, 1)
+
+        attn_output: torch.Tensor = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            query_states.size(1),
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
+        attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
+        attn_output = torch.cat(
+            (
+                attn_output[:, :, : self.num_heads // 2],
+                attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
+            ),
+            dim=2,
+        )
+
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
+# Modified from:
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
+def llama_sdpa_attention_forward(
+    self: "LlamaSdpaAttention",
+    hidden_states: "torch.Tensor",
+    attention_mask: Optional["torch.Tensor"] = None,
+    position_ids: Optional["torch.LongTensor"] = None,
+    past_key_value: Optional["Cache"] = None,
+    output_attentions: bool = False,
+    cache_position: Optional["torch.LongTensor"] = None,
+    position_embeddings: Optional[tuple["torch.Tensor", "torch.Tensor"]] = None,
+    **kwargs,
+) -> tuple["torch.Tensor", Optional["torch.Tensor"], Optional[tuple["torch.Tensor"]]]:
+    if output_attentions:
+        transformers_logger.warning_once(
+            "SDPA does not support `output_attentions=True`. Falling back to the vanilla attention"
+        )
+        return llama_attention_forward(
+            self,
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states: torch.Tensor = self.q_proj(hidden_states)
+    key_states: torch.Tensor = self.k_proj(hidden_states)
+    value_states: torch.Tensor = self.v_proj(hidden_states)
+
+    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+    if position_embeddings is None:
+        cos, sin = self.rotary_emb(value_states, position_ids)
+    else:
+        cos, sin = position_embeddings
+
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+    if past_key_value is not None:
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
+        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
+        assert q_len % groupsz == 0, f"q_len {q_len} should be divisible by group size {groupsz}."
+        num_groups = q_len // groupsz
+
+        def shift(state: "torch.Tensor") -> "torch.Tensor":
+            state = state.transpose(1, 2)  # output: (bsz, seq_len, n_heads, head_dim)
+            state = torch.cat(
+                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
+                dim=2,
+            )
+            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2)
+
+        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)
+
+    causal_mask = attention_mask
+    if attention_mask is not None:
+        causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+    if query_states.device.type == "cuda" and causal_mask is not None:  # avoid pytorch bug
+        query_states = query_states.contiguous()
+        key_states = key_states.contiguous()
+        value_states = value_states.contiguous()
+
+    is_causal = True if causal_mask is None and q_len > 1 else False
+    attn_output = torch.nn.functional.scaled_dot_product_attention(
+        query_states,
+        key_states,
+        value_states,
+        attn_mask=causal_mask,
+        dropout_p=self.attention_dropout if self.training else 0.0,
+        is_causal=is_causal,
+    )
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
+        attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
+        attn_output = torch.cat(
+            (
+                attn_output[:, :, : self.num_heads // 2],
+                attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
+            ),
+            dim=2,
+        )
+
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+    attn_output = self.o_proj(attn_output)
+
+    return attn_output, None, past_key_value
+
+
+def _apply_llama_patch() -> None:
+    check_version("transformers>=4.45.0,<4.48.0", mandatory=True)
+    LlamaAttention.forward = llama_attention_forward
+    LlamaFlashAttention2.forward = llama_flash_attention_2_forward
+    LlamaSdpaAttention.forward = llama_sdpa_attention_forward
+
+
+def configure_longlora(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
+    if not is_trainable or not model_args.shift_attn:
+        return
+
+    logger = logging.get_logger(__name__)
+
+    if getattr(config, "model_type", None) in SUPPORTED_CLASS_FOR_S2ATTN:
+        setattr(config, "group_size_ratio", 0.25)
+        _apply_llama_patch()
+        logger.info_rank0("Using shift short attention with group_size_ratio=1/4.")
+    else:
+        logger.warning_rank0("Current model does not support shift short attention.")
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/misc.py b/LlamaFactory/src/llamafactory/model/model_utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0249b47c0740c16e2b42a3e8c9f9f47691412fe
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/misc.py
@@ -0,0 +1,86 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...extras import logging
+from .visual import COMPOSITE_MODELS
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
+
+
+# Module-level logger; the helpers in this module report through its `info_rank0` method.
+logger = logging.get_logger(__name__)
+
+
+def find_all_linear_modules(model: "PreTrainedModel", freeze_vision_tower: bool) -> list[str]:
+    r"""Find all available modules to apply LoRA, GaLore or APOLLO."""
+    model_type = getattr(model.config, "model_type", None)
+    forbidden_modules = {"lm_head"}
+    if model_type == "chatglm":
+        forbidden_modules.add("output_layer")
+    elif model_type == "internlm2":
+        forbidden_modules.add("output")
+
+    if model_type in COMPOSITE_MODELS:
+        forbidden_modules.add(COMPOSITE_MODELS[model_type].projector_key)
+
+    if freeze_vision_tower and model_type in COMPOSITE_MODELS:
+        forbidden_modules.update(COMPOSITE_MODELS[model_type].vision_model_keys)
+
+    module_names = set()
+    for name, module in model.named_modules():
+        if any(forbidden_module in name for forbidden_module in forbidden_modules):
+            continue
+
+        if "Linear" in module.__class__.__name__ and "Embedding" not in module.__class__.__name__:
+            module_names.add(name.split(".")[-1])
+
+    logger.info_rank0("Found linear modules: {}".format(",".join(module_names)))
+    return list(module_names)
+
+
+def find_expanded_modules(model: "PreTrainedModel", target_modules: list[str], num_layer_trainable: int) -> list[str]:
+    r"""Find the modules in the expanded blocks to apply lora."""
+    num_layers = getattr(model.config, "num_hidden_layers", None)
+    if not num_layers:
+        raise ValueError("Model was not supported.")
+
+    if num_layers % num_layer_trainable != 0:
+        raise ValueError(
+            f"`num_layers` {num_layers} should be divisible by `num_layer_trainable` {num_layer_trainable}."
+        )
+
+    stride = num_layers // num_layer_trainable
+    trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
+    trainable_layers = [f".{idx:d}." for idx in trainable_layer_ids]
+    module_names = []
+    for name, _ in model.named_modules():
+        if any(target_module in name for target_module in target_modules) and any(
+            trainable_layer in name for trainable_layer in trainable_layers
+        ):
+            module_names.append(name)
+
+    logger.info_rank0("Apply lora to layers: {}.".format(",".join(map(str, trainable_layer_ids))))
+    return module_names
+
+
+def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer"):
+    if "AutoConfig" in getattr(config, "auto_map", {}):
+        config.__class__.register_for_auto_class()
+    if "AutoModelForCausalLM" in getattr(config, "auto_map", {}):
+        model.__class__.register_for_auto_class()
+    if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}):
+        tokenizer.__class__.register_for_auto_class()
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/mod.py b/LlamaFactory/src/llamafactory/model/model_utils/mod.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f67cd50dbc9016fc2cd650ae496501e8c594e76
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/mod.py
@@ -0,0 +1,42 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...extras.constants import MOD_SUPPORTED_MODELS
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+def load_mod_pretrained_model(**init_kwargs) -> "PreTrainedModel":
+    from MoD import AutoMoDModelForCausalLM
+
+    return AutoMoDModelForCausalLM.from_pretrained(**init_kwargs)
+
+
+def convert_pretrained_model_to_mod(
+    model: "PreTrainedModel", config: "PretrainedConfig", model_args: "ModelArguments"
+) -> "PreTrainedModel":
+    from MoD import apply_mod_to_hf
+
+    if getattr(config, "model_type", None) not in MOD_SUPPORTED_MODELS:
+        raise ValueError("Current model is not supported by mixture-of-depth.")
+
+    model = apply_mod_to_hf(model)
+    model = model.to(model_args.compute_dtype)
+    return model
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/moe.py b/LlamaFactory/src/llamafactory/model/model_utils/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..250c38c70cb54039f5e7989fbf9424fa7d695943
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/moe.py
@@ -0,0 +1,252 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+from ...extras.misc import check_version
+from ...extras.packages import is_transformers_version_greater_than
+
+
+if TYPE_CHECKING:
+    from torch import nn
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+if is_transformers_version_greater_than("4.57.0"):
+    from transformers.models.qwen3_omni_moe import modeling_qwen3_omni_moe
+
+
+def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: list[Union["nn.Module", str]]) -> None:
+    check_version("deepspeed>=0.13.0")
+    from deepspeed.utils import set_z3_leaf_modules  # type: ignore
+
+    set_z3_leaf_modules(model, leaf_modules)
+
+
+def add_z3_leaf_module(model: "PreTrainedModel") -> None:
+    r"""Set module as a leaf module to skip partitioning in deepspeed zero3."""
+    if not is_deepspeed_zero3_enabled():
+        return
+
+    model_type = getattr(model.config, "model_type", None)
+    text_config = getattr(model.config, "text_config", None)
+    text_model_type = getattr(text_config, "model_type", None)
+
+    if model_type == "dbrx":
+        from transformers.models.dbrx.modeling_dbrx import DbrxFFN
+
+        _set_z3_leaf_modules(model, [DbrxFFN])
+
+    if model_type == "deepseek_v2":
+        # deepseek v2 uses custom code
+        _set_z3_leaf_modules(model, ["DeepseekV2MoE"])
+
+    if model_type == "deepseek_v3" or model_type == "kimi_vl":
+        # deepseek v3 and kimi vl use custom code
+        _set_z3_leaf_modules(model, ["DeepseekV3MoE"])
+
+    if model_type == "ernie4_5_moe":
+        from transformers.models.ernie4_5_moe.modeling_ernie4_5_moe import Ernie4_5_MoeSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [Ernie4_5_MoeSparseMoeBlock])
+
+    if model_type == "granitemoe":
+        from transformers.models.granitemoe.modeling_granitemoe import GraniteMoeMoE
+
+        _set_z3_leaf_modules(model, [GraniteMoeMoE])
+
+    if model_type == "glm4_moe":
+        from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeMoE
+
+        _set_z3_leaf_modules(model, [Glm4MoeMoE])
+
+    if model_type == "glm4v_moe":
+        from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeTextMoE
+
+        _set_z3_leaf_modules(model, [Glm4vMoeTextMoE])
+
+    if model_type == "gpt_oss":
+        from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP
+
+        _set_z3_leaf_modules(model, [GptOssMLP])
+
+    if model_type == "jamba":
+        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [JambaSparseMoeBlock])
+
+    if model_type == "jetmoe":
+        from transformers.models.jetmoe.modeling_jetmoe import JetMoeMoA, JetMoeMoE
+
+        _set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE])
+
+    if model_type == "llama4":
+        from transformers.models.llama4.modeling_llama4 import Llama4TextMoe
+
+        _set_z3_leaf_modules(model, [Llama4TextMoe])
+
+    if model_type == "mixtral":
+        from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
+
+    if model_type == "olmoe":
+        from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [OlmoeSparseMoeBlock])
+
+    if model_type == "phimoe":
+        from transformers.models.phimoe.modeling_phimoe import PhimoeSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [PhimoeSparseMoeBlock])
+
+    if model_type == "qwen2_moe":
+        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])
+
+    if model_type == "qwen3_moe" or text_model_type == "qwen3_moe":  # internvl 3.5
+        from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [Qwen3MoeSparseMoeBlock])
+
+    if model_type == "qwen3_vl_moe":
+        from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [Qwen3VLMoeTextSparseMoeBlock])
+
+    if model_type in ("qwen3_omni_moe", "qwen3_omni_moe_thinker"):
+        from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import Qwen3OmniMoeThinkerTextSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [Qwen3OmniMoeThinkerTextSparseMoeBlock])
+
+
+def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
+    if not is_trainable or not model_args.moe_aux_loss_coef:
+        return
+
+    model_type = getattr(config, "model_type", None)
+    text_config = getattr(config, "text_config", None)  # for multimodal model
+
+    if model_type in [
+        "dbrx",
+        "ernie4_5_moe",
+        "granitemoe",
+        "jamba",
+        "jetmoe",
+        "llama4",
+        "mixtral",
+        "olmoe",
+        "phimoe",
+        "qwen2_moe",
+        "qwen3_moe",
+    ]:
+        setattr(config, "output_router_logits", True)
+
+    if text_config and getattr(text_config, "model_type", None) in [
+        "glm4v_moe_text",  # glmv4_5
+        "qwen3_moe",  # internvl_3_5
+    ]:
+        setattr(text_config, "output_router_logits", True)
+
+    if model_type in [
+        "ernie4_5_moe",
+        "granitemoe",
+        "jamba",
+        "llama4",
+        "mixtral",
+        "olmoe",
+        "phimoe",
+        "qwen2_moe",
+        "qwen3_moe",
+    ]:
+        setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
+
+    elif text_config and getattr(text_config, "model_type", None) in ["qwen3_moe"]:
+        setattr(text_config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
+
+    elif model_type == "deepseek":
+        setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef)
+
+    elif model_type == "jetmoe":
+        setattr(config, "aux_loss_coef", model_args.moe_aux_loss_coef)
+
+
+class Qwen3OmniMoeThinkerTextSparseMoeBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+        self.norm_topk_prob = config.norm_topk_prob
+
+        # gating
+        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
+        self.experts = nn.ModuleList(
+            [
+                modeling_qwen3_omni_moe.Qwen3OmniMoeThinkerTextMLP(
+                    config, intermediate_size=config.moe_intermediate_size
+                )
+                for _ in range(self.num_experts)
+            ]
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(hidden_states)
+
+        # Calculate the routing weights for all experts
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+
+        # Retain the weight of the top_k and reset the rest of the expert rights to 0 (instead of retaining only top_k experts)
+        top_k_weights, top_k_indices = torch.topk(routing_weights, self.top_k, dim=-1)
+        # Initialize the all-zero weight matrix (same shape as all experts)
+        full_routing_weights = torch.zeros_like(routing_weights)
+        # Only the weight of top_k experts is retained, and the weight of the rest of the experts remains at 0
+        full_routing_weights.scatter_(1, top_k_indices, top_k_weights)
+
+        # Normalized top_k weights (keep the original logic consistent)
+        if self.norm_topk_prob:
+            # Calculate the sum of the weights top_k each row (for normalization)
+            top_k_sum = full_routing_weights.sum(dim=-1, keepdim=True)
+            # Avoid dividing by zero
+            top_k_sum = torch.clamp(top_k_sum, min=1e-9)
+            full_routing_weights /= top_k_sum
+
+        # Convert back to the input data type
+        full_routing_weights = full_routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # Go through all the experts (not just the selected ones)
+        for expert_idx in range(self.num_experts):
+            expert_layer = self.experts[expert_idx]
+            # Get the weight of the current expert (inactive expert has a weight of 0 here)
+            expert_weights = full_routing_weights[:, expert_idx, None]  # shape: (batch*seq, 1)
+            # All samples participate in the calculations of the current expert, the weight may be equal to 0
+            current_hidden_states = expert_layer(hidden_states) * expert_weights
+            # Add-up to all expert outputs (experts with a weight of 0 do not affect the result)
+            final_hidden_states += current_hidden_states
+
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/packing.py b/LlamaFactory/src/llamafactory/model/model_utils/packing.py
new file mode 100644
index 0000000000000000000000000000000000000000..66409f3863d0b176cbf981a1d0d2890688d94a8c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/packing.py
@@ -0,0 +1,117 @@
+# Copyright 2025 Musab Gultekin and the LlamaFactory team.
+#
+# This code is based on Musab Gultekin's functionary library.
+# https://github.com/MeetKai/functionary/blob/main/functionary/train/packing/monkey_patch_packing.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2023 Musab Gultekin
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from typing import TYPE_CHECKING
+
+import torch
+import torch.nn.functional as F
+
+from ...extras import logging
+
+
+if TYPE_CHECKING:
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+
+def get_seqlens_in_batch(attention_mask: "torch.Tensor") -> "torch.Tensor":
+    r"""Count the tokens of every packed sequence in the batch.
+
+    Sequence `k` of a row is marked with the value `k` in `attention_mask`; `0` is not counted. For
+
+    ```python
+    attention_mask = [
+        [1, 1, 2, 2, 2, 0],
+        [1, 2, 2, 3, 3, 3],
+    ]
+    ```
+
+    the result is `[2, 3, 1, 2, 3]`: lengths listed row by row, in the dtype of `attention_mask`.
+    """
+    num_rows = attention_mask.size(0)
+    num_segments = torch.max(attention_mask).item()
+    # one column per sequence id, allocated with the mask's own dtype and device
+    per_segment = torch.zeros((num_rows, num_segments), dtype=attention_mask.dtype, device=attention_mask.device)
+    for segment_id in range(1, num_segments + 1):
+        per_segment[:, segment_id - 1] = (attention_mask == segment_id).sum(dim=-1)
+
+    # rows holding fewer sequences than the maximum leave zero entries behind; drop them
+    flat_counts = per_segment.flatten()
+    return flat_counts[flat_counts.nonzero().squeeze(dim=-1)]
+
+
+def get_unpad_data(attention_mask: "torch.Tensor") -> tuple["torch.Tensor", "torch.Tensor", int]:
+    r"""Derive the indices and sequence boundaries for the flash attn varlen function from a packed mask.
+
+    Returns:
+        indices: positions of the non-zero entries in the flattened attention mask.
+        cu_seqlens: int32 running totals of the sequence lengths, with a leading 0.
+        max_seqlen_in_batch: length of the longest sequence in the batch.
+
+    Example:
+    ```python
+    attention_mask = [
+        [1, 1, 2, 2, 2, 0],
+        [1, 2, 2, 3, 3, 3],
+    ]
+    # indices
+    [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]
+    # cu_seqlens
+    [0, 2, 5, 6, 8, 11]
+    # max_seqlen_in_batch
+    3
+    ```
+    """
+    seqlens = get_seqlens_in_batch(attention_mask)
+    token_indices = attention_mask.flatten().nonzero(as_tuple=False).flatten()
+    running_totals = torch.cumsum(seqlens, dim=0, dtype=torch.int32)
+    cu_seqlens = F.pad(running_totals, (1, 0))  # one zero is prepended on the left
+    return token_indices, cu_seqlens, seqlens.max().item()
+
+
+def configure_packing(model_args: "ModelArguments", is_trainable: bool) -> None:
+    r"""Monkey-patch transformers' `_get_unpad_data` with `get_unpad_data` when training with `block_diag_attn`."""
+    if is_trainable and model_args.block_diag_attn:
+        import transformers.modeling_flash_attention_utils as flash_attn_utils
+
+        # replace the library's private unpad helper with the packing-aware one from this module
+        flash_attn_utils._get_unpad_data = get_unpad_data
+        logger.info_rank0("Using block diagonal attention for sequence packing without cross-attention.")
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/quantization.py b/LlamaFactory/src/llamafactory/model/model_utils/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ad607db3e1657360fa2b1ff0175a9f286a1ed4d
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/quantization.py
@@ -0,0 +1,216 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by HuggingFace's Transformers and Optimum libraries.
+# https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/utils/quantization_config.py
+# https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+from typing import TYPE_CHECKING, Any
+
+import torch
+from datasets import load_dataset
+from transformers import BitsAndBytesConfig, EetqConfig, GPTQConfig, HqqConfig
+from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.modeling_utils import is_fsdp_enabled
+
+from ...extras import logging
+from ...extras.constants import FILEEXT2TYPE, QuantizationMethod
+from ...extras.misc import check_version, get_current_device
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedTokenizer
+
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+
+def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> list[dict[str, Any]]:
+    r"""Sample random token windows for AutoGPTQ. Plain lists are returned so the result stays JSON serializable."""
+    dataset_name = model_args.export_quantization_dataset
+    if os.path.isfile(dataset_name):
+        # local file: the loader type is looked up from the file extension
+        data_path, data_files = FILEEXT2TYPE.get(dataset_name.split(".")[-1], None), dataset_name
+    else:
+        # anything else is handed to `load_dataset` unchanged
+        data_path, data_files = dataset_name, None
+
+    dataset = load_dataset(
+        path=data_path,
+        data_files=data_files,
+        split="train",
+        cache_dir=model_args.cache_dir,
+        token=model_args.hf_hub_token,
+    )
+
+    window = model_args.export_quantization_maxlen
+    collected = []
+    for _ in range(model_args.export_quantization_nsamples):
+        for _attempt in range(101):  # at most 101 random draws per requested sample
+            row = dataset[random.randint(0, len(dataset) - 1)]
+            encoded: dict[str, torch.Tensor] = tokenizer(row["text"], return_tensors="pt")
+            if encoded["input_ids"].size(1) > window:
+                break  # TODO: fix large maxlen
+        else:
+            raise ValueError("Cannot find satisfying example, considering decrease `export_quantization_maxlen`.")
+
+        # cut one random window of `window` tokens out of the accepted example
+        start = random.randint(0, encoded["input_ids"].size(1) - window - 1)
+        stop = start + window
+        input_ids = encoded["input_ids"][:, start:stop].tolist()
+        attention_mask = encoded["attention_mask"][:, start:stop].tolist()
+        collected.append({"input_ids": input_ids, "attention_mask": attention_mask})
+
+    return collected
+
+
+def configure_quantization(
+    config: "PretrainedConfig",
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    is_trainable: bool,  # not read anywhere in this function
+    init_kwargs: dict[str, Any],  # filled in place; nothing is returned
+) -> None:
+    r"""Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer)."""
+    if getattr(config, "quantization_config", None):  # ptq
+        if model_args.quantization_bit is not None:
+            logger.warning_rank0("`quantization_bit` will not affect on the PTQ-quantized models.")
+        # from here on the checkpoint's quantization config is handled as a plain dict (get / pop / item assignment)
+        quantization_config: dict[str, Any] = getattr(config, "quantization_config", None)
+        quant_method = quantization_config.get("quant_method", "")
+
+        if quant_method not in (QuantizationMethod.MXFP4, QuantizationMethod.FP8) and (
+            is_deepspeed_zero3_enabled() or is_fsdp_enabled()
+        ):
+            # mxfp4 will dequant the model weights
+            raise ValueError("DeepSpeed ZeRO-3 or FSDP is incompatible with PTQ-quantized models.")
+        # MXFP4 and FP8 checkpoints are loaded with `dequantize=True` and `ignore_mismatched_sizes=True`
+        if quant_method == QuantizationMethod.MXFP4:
+            from transformers import Mxfp4Config
+
+            quant_config = Mxfp4Config(dequantize=True)
+            init_kwargs["quantization_config"] = quant_config
+            init_kwargs["ignore_mismatched_sizes"] = True
+
+        if quant_method == QuantizationMethod.FP8:
+            from transformers import FineGrainedFP8Config
+
+            quant_config = FineGrainedFP8Config(dequantize=True)
+            init_kwargs["quantization_config"] = quant_config
+            init_kwargs["ignore_mismatched_sizes"] = True
+
+        if quant_method == QuantizationMethod.GPTQ:
+            check_version("gptqmodel>=2.0.0", mandatory=True)
+            quantization_config.pop("disable_exllama", None)  # remove deprecated args
+            quantization_config["use_exllama"] = False  # disable exllama
+
+        if quant_method == QuantizationMethod.AWQ:
+            check_version("autoawq", mandatory=True)
+        # AQLM: the "bits" entry of the checkpoint config is overwritten with 2
+        if quant_method == QuantizationMethod.AQLM:
+            check_version("aqlm>=1.1.0", mandatory=True)
+            quantization_config["bits"] = 2
+        # "?" is logged when the checkpoint config carries no "bits" entry
+        quant_bits = quantization_config.get("bits", "?")
+        logger.info_rank0(f"Loading {quant_bits}-bit {quant_method.upper()}-quantized model.")
+
+    elif model_args.export_quantization_bit is not None:  # gptqmodel
+        if model_args.export_quantization_bit not in [8, 4, 3, 2]:
+            raise ValueError("AutoGPTQ only accepts 2/3/4/8-bit quantization.")
+
+        check_version("optimum>=1.24.0", mandatory=True)
+        check_version("gptqmodel>=2.0.0", mandatory=True)
+        from accelerate.utils import get_max_memory
+
+        if getattr(config, "model_type", None) == "chatglm":
+            raise ValueError("ChatGLM model is not supported yet.")
+        # put `language_model.model.layers` first in optimum's BLOCK_PATTERNS; skipped if the import fails
+        try:
+            from optimum.gptq import utils as gq_utils
+
+            if "language_model.model.layers" not in gq_utils.BLOCK_PATTERNS:
+                gq_utils.BLOCK_PATTERNS.insert(0, "language_model.model.layers")
+        except ImportError:
+            pass
+        # gemma3 / paligemma get an explicit `block_name_to_quantize`; other model types pass None
+        block_name_to_quantize = None
+        if getattr(config, "model_type", None) in ["gemma3", "paligemma"]:
+            block_name_to_quantize = "language_model.model.layers"
+        # note: `_get_quantization_dataset` runs right here, while the config object is being built
+        init_kwargs["quantization_config"] = GPTQConfig(
+            bits=model_args.export_quantization_bit,
+            tokenizer=tokenizer,
+            dataset=_get_quantization_dataset(tokenizer, model_args),
+            block_name_to_quantize=block_name_to_quantize,
+        )
+        init_kwargs["device_map"] = "auto"
+        init_kwargs["max_memory"] = get_max_memory()
+        model_args.compute_dtype = torch.float16  # force fp16 for gptqmodel
+        logger.info_rank0(f"Quantizing model to {model_args.export_quantization_bit} bit with GPTQModel.")
+
+    elif model_args.quantization_bit is not None:  # on-the-fly
+        if model_args.quantization_method == QuantizationMethod.BNB:
+            if model_args.quantization_bit == 8:
+                check_version("bitsandbytes>=0.37.0", mandatory=True)
+                init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+            elif model_args.quantization_bit == 4:
+                check_version("bitsandbytes>=0.39.0", mandatory=True)
+                init_kwargs["quantization_config"] = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=model_args.compute_dtype,
+                    bnb_4bit_use_double_quant=model_args.double_quantization,
+                    bnb_4bit_quant_type=model_args.quantization_type,
+                    bnb_4bit_quant_storage=model_args.compute_dtype,  # crucial for fsdp+qlora
+                )
+            else:
+                raise ValueError("Bitsandbytes only accepts 4-bit or 8-bit quantization.")
+
+            # Do not assign device map if:
+            # 1. deepspeed zero3 or fsdp (train)
+            # 2. auto quantization device map (inference)
+            if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto":
+                if model_args.quantization_bit != 4:
+                    raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.")
+
+                check_version("bitsandbytes>=0.43.0", mandatory=True)
+            else:
+                init_kwargs["device_map"] = {"": get_current_device()}  # change auto device map for inference
+
+            logger.info_rank0(f"Quantizing model to {model_args.quantization_bit} bit with bitsandbytes.")
+        elif model_args.quantization_method == QuantizationMethod.HQQ:
+            if model_args.quantization_bit not in [8, 6, 5, 4, 3, 2, 1]:
+                raise ValueError("HQQ only accepts 1/2/3/4/5/6/8-bit quantization.")
+
+            if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
+                raise ValueError("HQQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.")
+
+            check_version("hqq", mandatory=True)
+            init_kwargs["quantization_config"] = HqqConfig(
+                nbits=model_args.quantization_bit, quant_zero=False, quant_scale=False, axis=0
+            )  # use ATEN kernel (axis=0) for performance
+            logger.info_rank0(f"Quantizing model to {model_args.quantization_bit} bit with HQQ.")
+        elif model_args.quantization_method == QuantizationMethod.EETQ:
+            if model_args.quantization_bit != 8:
+                raise ValueError("EETQ only accepts 8-bit quantization.")
+
+            if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
+                raise ValueError("EETQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.")
+
+            check_version("eetq", mandatory=True)
+            init_kwargs["quantization_config"] = EetqConfig()
+            logger.info_rank0(f"Quantizing model to {model_args.quantization_bit} bit with EETQ.")
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/rope.py b/LlamaFactory/src/llamafactory/model/model_utils/rope.py
new file mode 100644
index 0000000000000000000000000000000000000000..b217735b76d847296471336fe615db9d325520d8
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/rope.py
@@ -0,0 +1,84 @@
+# Copyright 2025 LMSYS and the LlamaFactory team.
+# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+# This code is inspired by LMSYS's FastChat library.
+# https://github.com/lm-sys/FastChat/blob/v0.2.30/fastchat/train/train.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import TYPE_CHECKING
+
+from ...extras import logging
+from ...extras.constants import RopeScaling
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+
+def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
+    r"""Apply the requested RoPE scaling to `config` in place (`rope_scaling` and `max_position_embeddings`)."""
+    if model_args.rope_scaling is None:
+        return
+
+    if not hasattr(config, "rope_scaling"):
+        logger.warning_rank0("Current model does not support RoPE scaling.")
+        return
+
+    rope_scaling = getattr(config, "rope_scaling", None)
+    if isinstance(rope_scaling, dict) and "original_max_position_embeddings" in rope_scaling:
+        old_max_length = rope_scaling["original_max_position_embeddings"]
+    elif hasattr(config, "max_position_embeddings"):
+        old_max_length = getattr(config, "max_position_embeddings", None)
+    else:
+        logger.warning_rank0("Cannot find the max position embeddings in the config.")
+        return
+
+    if model_args.model_max_length is not None:  # training
+        if model_args.model_max_length <= old_max_length:
+            logger.warning_rank0("Input length is smaller than max length. Disabling rope scaling.")
+            return
+
+        if model_args.rope_scaling == RopeScaling.DYNAMIC:
+            logger.warning_rank0(
+                "Dynamic NTK scaling may not work well with fine-tuning. "
+                "See: https://github.com/huggingface/transformers/pull/24653"
+            )
+
+        rope_factor = float(math.ceil(model_args.model_max_length / old_max_length))
+    else:  # inference
+        rope_factor = 2.0
+
+    rope_type = getattr(model_args.rope_scaling, "value", model_args.rope_scaling)  # handle enum
+    rope_kwargs = {"rope_type": rope_type, "factor": rope_factor}
+    new_max_length = int(old_max_length * rope_factor)  # the factor is a float; the position limit stays an int
+    setattr(config, "max_position_embeddings", new_max_length)
+    logger.info_rank0(f"Enlarge max model length from {old_max_length} to {new_max_length}.")
+
+    if model_args.rope_scaling in [RopeScaling.DYNAMIC, RopeScaling.YARN]:
+        rope_kwargs["original_max_position_embeddings"] = old_max_length
+    elif model_args.rope_scaling == RopeScaling.LLAMA3:
+        rope_kwargs["original_max_position_embeddings"] = old_max_length
+        rope_kwargs["low_freq_factor"] = 1.0
+        rope_kwargs["high_freq_factor"] = 4.0
+
+    setattr(config, "rope_scaling", rope_kwargs)
+    logger.info_rank0(
+        f"Using {rope_kwargs['rope_type']} scaling strategy and setting scaling factor to {rope_kwargs['factor']}."
+    )
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/unsloth.py b/LlamaFactory/src/llamafactory/model/model_utils/unsloth.py
new file mode 100644
index 0000000000000000000000000000000000000000..91e18dac966071491a061c241732f6be02c321f6
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/unsloth.py
@@ -0,0 +1,103 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ...extras.misc import get_current_device
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ...hparams import FinetuningArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+
+def _get_unsloth_kwargs(
+    config: "PretrainedConfig",
+    model_name_or_path: str,
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+) -> dict[str, Any]:
+    return dict(  # keyword arguments that the loaders below unpack into `FastLanguageModel.from_pretrained`
+        model_name=model_name_or_path,
+        max_seq_length=model_args.model_max_length or 4096,  # 4096 when no max length is configured
+        dtype=model_args.compute_dtype,
+        load_in_4bit=model_args.quantization_bit == 4,
+        token=model_args.hf_hub_token,
+        full_finetuning=finetuning_args.finetuning_type == "full",
+        device_map={"": get_current_device()},
+        rope_scaling=getattr(config, "rope_scaling", None),
+        fix_tokenizer=False,
+        trust_remote_code=model_args.trust_remote_code,
+        use_gradient_checkpointing="unsloth",
+    )
+
+
+def load_unsloth_pretrained_model(
+    config: "PretrainedConfig", model_args: "ModelArguments", finetuning_args: "FinetuningArguments"
+) -> Optional["PreTrainedModel"]:
+    r"""Try to load the pretrained model through unsloth; None is returned if unsloth rejects it. Used in training."""
+    from unsloth import FastLanguageModel  # type: ignore
+
+    load_kwargs = _get_unsloth_kwargs(config, model_args.model_name_or_path, model_args, finetuning_args)
+    try:
+        model, _ = FastLanguageModel.from_pretrained(**load_kwargs)
+    except NotImplementedError:
+        model_type = getattr(config, "model_type", None)
+        logger.warning_rank0(f"Unsloth does not support model type {model_type}.")
+        model_args.use_unsloth = False  # unsloth is switched off on the arguments object as well
+        return None
+    return model
+
+
+def get_unsloth_peft_model(
+    model: "PreTrainedModel", model_args: "ModelArguments", peft_kwargs: dict[str, Any]
+) -> "PreTrainedModel":
+    r"""Turn `model` into a peft model through unsloth, forwarding `peft_kwargs`. Used in training."""
+    from unsloth import FastLanguageModel  # type: ignore
+
+    return FastLanguageModel.get_peft_model(
+        **peft_kwargs,
+        model=model,
+        max_seq_length=model_args.model_max_length,
+        use_gradient_checkpointing="unsloth",
+    )
+
+
+def load_unsloth_peft_model(
+    config: "PretrainedConfig",
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
+) -> "PreTrainedModel":
+    r"""Load the first adapter in `adapter_name_or_path` with unsloth. Used in both training and inference."""
+    from unsloth import FastLanguageModel  # type: ignore
+
+    load_kwargs = _get_unsloth_kwargs(config, model_args.adapter_name_or_path[0], model_args, finetuning_args)
+    if not is_trainable:  # gradient checkpointing is only requested for training
+        load_kwargs["use_gradient_checkpointing"] = False
+
+    try:
+        model, _ = FastLanguageModel.from_pretrained(**load_kwargs)
+    except NotImplementedError:
+        raise ValueError(f"Unsloth does not support model type {getattr(config, 'model_type', None)}.")
+
+    if not is_trainable:
+        FastLanguageModel.for_inference(model)
+
+    return model
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/valuehead.py b/LlamaFactory/src/llamafactory/model/model_utils/valuehead.py
new file mode 100644
index 0000000000000000000000000000000000000000..7409a22edd3a66f08b9b9c3b99d2e9354e32c4c3
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/valuehead.py
@@ -0,0 +1,72 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+import torch
+from transformers.utils import cached_file
+
+from ...extras import logging
+from ...extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+
+def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> "dict[str, torch.Tensor] | None":
+    r"""Load value head parameters from Hugging Face Hub or local disk (safetensors first, then the torch file).
+
+    Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`, or None if neither file loads.
+    """
+    kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir, "token": model_args.hf_hub_token}
+    errors = []  # both failures are kept so the log does not hide the safetensors error
+
+    try:
+        from safetensors import safe_open
+
+        vhead_file = cached_file(filename=V_HEAD_SAFE_WEIGHTS_NAME, **kwargs)
+        with safe_open(vhead_file, framework="pt", device="cpu") as f:
+            return {key: f.get_tensor(key) for key in f.keys()}
+    except Exception as err:
+        errors.append(str(err))
+
+    try:
+        vhead_file = cached_file(filename=V_HEAD_WEIGHTS_NAME, **kwargs)
+        return torch.load(vhead_file, map_location="cpu", weights_only=True)
+    except Exception as err:
+        errors.append(str(err))
+
+    logger.info_rank0(f"Provided path ({path_or_repo_id}) does not contain value head weights: {'; '.join(errors)}.")
+    logger.info_rank0("Ignore the above message if you are not resuming the training of a value head model.")
+    return None
+
+
+def prepare_valuehead_model(model: "PreTrainedModel") -> None:
+    r"""Alias the output layer of llava / chatglm / internlm2 models as `lm_head` and list its weight as ignored."""
+    model_type = getattr(model.config, "model_type", None)
+    if model_type == "llava":
+        model.lm_head = model.language_model.get_output_embeddings()
+    elif model_type == "chatglm":
+        model.lm_head = model.transformer.output_layer
+    elif model_type == "internlm2":
+        model.lm_head = model.output
+
+    if model_type in ("llava", "chatglm", "internlm2"):
+        model._keys_to_ignore_on_save = ["lm_head.weight"]
diff --git a/LlamaFactory/src/llamafactory/model/model_utils/visual.py b/LlamaFactory/src/llamafactory/model/model_utils/visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d23b6e237fa88fa4326841dfd1c3c5ffd87a524
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/model_utils/visual.py
@@ -0,0 +1,391 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's Transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+import transformers
+import transformers.models
+from transformers.activations import ACT2FN
+
+from ...extras import logging
+from ...extras.packages import is_transformers_version_greater_than
+
+
+if TYPE_CHECKING:
+    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel
+
+    from ...hparams import FinetuningArguments, ModelArguments
+
+
+# Project logger; this module calls its `info_rank0` method.
+logger = logging.get_logger(__name__)
+# Logger obtained from transformers' own logging utilities; used for `warning_once` in the Yi-VL projector.
+transformers_logger = transformers.utils.logging.get_logger(__name__)
+
+
+@dataclass
+class CompositeModel:
+    model_type: str
+    projector_key: str
+    vision_model_keys: list[str]
+    language_model_keys: list[str]
+    lora_conflict_keys: list[str]
+
+    def get_projector(self, module: "torch.nn.Module") -> "torch.nn.Module":
+        for key in self.projector_key.split("."):
+            module = getattr(module, key)
+
+        return module
+
+
+COMPOSITE_MODELS: dict[str, "CompositeModel"] = {}
+
+
+def _register_composite_model(
+    model_type: str,
+    projector_key: Optional[str] = None,
+    vision_model_keys: Optional[list[str]] = None,
+    language_model_keys: Optional[list[str]] = None,
+    lora_conflict_keys: Optional[list[str]] = None,
+):
+    r"""Register a new composite model.
+
+    Args:
+        model_type: model type
+        projector_key: multi_modal_projector
+        vision_model_keys: vision_tower
+        language_model_keys: language_model
+        lora_conflict_keys: None
+
+    """
+    COMPOSITE_MODELS[model_type] = CompositeModel(
+        model_type=model_type,
+        projector_key=projector_key or "multi_modal_projector",
+        vision_model_keys=vision_model_keys or ["vision_tower"],
+        language_model_keys=language_model_keys or ["language_model", "lm_head"],
+        lora_conflict_keys=lora_conflict_keys or [],
+    )
+
+
+class LlavaMultiModalProjectorForYiVL(torch.nn.Module):
+    def __init__(self, config: "LlavaConfig") -> None:
+        super().__init__()
+
+        self.config = config
+        if config is None:
+            return
+
+        self.linear_1 = torch.nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
+        self.linear_3 = torch.nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
+        self.act = ACT2FN[config.projector_hidden_act]
+
+    def forward(self, image_features: "torch.Tensor") -> "torch.Tensor":
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.linear_2(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_3(hidden_states)
+        hidden_states = self.linear_4(hidden_states)
+        if hidden_states.dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.linear_1.weight.dtype
+
+            transformers_logger.warning_once("The hidden states seems to be silently casted in float32.")
+            hidden_states = hidden_states.to(target_dtype)
+
+        return hidden_states
+
+
+class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL):
+    def __init__(self, vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str) -> None:
+        super().__init__(config=None)
+
+        self.linear_1 = torch.nn.Linear(vision_hidden_size, text_hidden_size, bias=True)
+        self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True)
+        self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True)
+        self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True)
+        self.act = ACT2FN[projector_hidden_act]
+
+
+def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArguments") -> None:
+    r"""Cast projector output to half precision for fine-tuning quantized VLMs."""
+
+    def _mm_projector_forward_post_hook(
+        module: "torch.nn.Module", args: tuple["torch.Tensor"], output: "torch.Tensor"
+    ) -> "torch.Tensor":
+        return output.to(model_args.compute_dtype)
+
+    if getattr(model, "quantization_method", None):
+        model_type = getattr(model.config, "model_type", None)
+        if model_type in COMPOSITE_MODELS:
+            mm_projector = COMPOSITE_MODELS[model_type].get_projector(model)
+        else:
+            return
+
+        logger.info_rank0(f"Casting multimodal projector outputs in {model_args.compute_dtype}.")
+        mm_projector.register_forward_hook(_mm_projector_forward_post_hook)
+
+
+def configure_visual_model(config: "PretrainedConfig") -> None:
+    r"""Patch VLMs before loading them."""
+    if getattr(config, "text_config", None) and not getattr(config, "hidden_size", None):
+        # required for ds zero3 and valuehead models
+        setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))
+
+    if getattr(config, "is_yi_vl_derived_model", None):
+        logger.info_rank0("Detected Yi-VL model, applying projector patch.")
+        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVL
+
+
+def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "FinetuningArguments") -> set[str]:
+    r"""Freeze vision tower and language model for VLM full/freeze tuning."""
+    model_type = getattr(config, "model_type", None)
+    forbidden_modules = set()
+    if model_type in COMPOSITE_MODELS:
+        if finetuning_args.freeze_vision_tower:
+            vision_model_keys = COMPOSITE_MODELS[model_type].vision_model_keys
+            logger.info_rank0(f"Set vision model not trainable: {vision_model_keys}.")
+            forbidden_modules.update(vision_model_keys)
+
+        if finetuning_args.freeze_multi_modal_projector:
+            projector_key = COMPOSITE_MODELS[model_type].projector_key
+            logger.info_rank0(f"Set multi model projector not trainable: {projector_key}.")
+            forbidden_modules.add(projector_key)
+
+        if finetuning_args.freeze_language_model:
+            language_model_keys = COMPOSITE_MODELS[model_type].language_model_keys
+            logger.info_rank0(f"Set language model not trainable: {language_model_keys}.")
+            forbidden_modules.update(language_model_keys)
+
+    return forbidden_modules
+
+
+def patch_target_modules(
+    model: "PreTrainedModel", finetuning_args: "FinetuningArguments", target_modules: list[str]
+) -> list[str]:
+    r"""Freeze vision tower for VLM LoRA tuning."""
+    model_type = getattr(model.config, "model_type", None)
+    if model_type in COMPOSITE_MODELS:
+        forbidden_modules = get_forbidden_modules(model.config, finetuning_args)
+        forbidden_modules.update(COMPOSITE_MODELS[model_type].lora_conflict_keys)
+        module_names = []
+        for name, _ in model.named_modules():
+            if any(target_module in name for target_module in target_modules) and not any(
+                forbidden_module in name for forbidden_module in forbidden_modules
+            ):
+                module_names.append(name)
+
+        return module_names
+    else:
+        return target_modules
+
+
+# dots_ocr: the projector is the `merger` inside `vision_tower`; `merger` is also kept out of LoRA targets.
+_register_composite_model(
+    model_type="dots_ocr",
+    projector_key="vision_tower.merger",
+    vision_model_keys=["vision_tower"],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["merger"],
+)
+
+
+# gemma3: every field uses the registration defaults.
+_register_composite_model(
+    model_type="gemma3",
+)
+
+
+# gemma3n: `audio_tower` is listed next to `vision_tower` in the vision keys; projector and language keys are defaults.
+_register_composite_model(
+    model_type="gemma3n",
+    vision_model_keys=["vision_tower", "audio_tower"],
+    lora_conflict_keys=["timm_model", "subsample_conv_projection"],
+)
+
+
+# copied from qwen2vl
+# glm4v: projector at `visual.merger`; `patch_embed` is kept out of LoRA targets.
+_register_composite_model(
+    model_type="glm4v",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# glm4v_moe: same keys as glm4v.
+_register_composite_model(
+    model_type="glm4v_moe",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# internvl: every field uses the registration defaults.
+_register_composite_model(
+    model_type="internvl",
+)
+
+# interns1: every field uses the registration defaults.
+_register_composite_model(
+    model_type="interns1",
+)
+
+# NOTE(review): "Keye" is the only capitalised model_type in this registry - confirm it matches the model config.
+_register_composite_model(
+    model_type="Keye",
+    projector_key="mlp_AR",
+    vision_model_keys=["visual.vision_model.patch_embedding", "visual.vision_model.encoder"],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["patch_embedding"],
+)
+
+
+# kimi_vl: every field uses the registration defaults.
+_register_composite_model(
+    model_type="kimi_vl",
+)
+
+
+# llama4: vision keys are `vision_model`; the other fields use the defaults.
+_register_composite_model(
+    model_type="llama4",
+    vision_model_keys=["vision_model"],
+)
+
+
+# llava: every field uses the registration defaults.
+_register_composite_model(
+    model_type="llava",
+)
+
+
+# llava_next: every field uses the registration defaults.
+_register_composite_model(
+    model_type="llava_next",
+)
+
+
+# llava_next_video: every field uses the registration defaults.
+_register_composite_model(
+    model_type="llava_next_video",
+)
+
+
+# minicpmv: projector `resampler`, vision keys `vpm`, language keys `llm`.
+_register_composite_model(
+    model_type="minicpmv",
+    projector_key="resampler",
+    vision_model_keys=["vpm"],
+    language_model_keys=["llm"],
+)
+
+
+# minicpmo: like minicpmv, but the vision keys also list `apm`, `audio_avg_pooler`,
+# `audio_projection_layer` and `tts`; `audio_projection_layer` is kept out of LoRA targets.
+_register_composite_model(
+    model_type="minicpmo",
+    projector_key="resampler",
+    vision_model_keys=["vpm", "apm", "audio_avg_pooler", "audio_projection_layer", "tts"],
+    language_model_keys=["llm"],
+    lora_conflict_keys=["audio_projection_layer"],
+)
+
+
+# mistral3: the projector is resolved through `model.multi_modal_projector`.
+_register_composite_model(
+    model_type="mistral3",
+    projector_key="model.multi_modal_projector",
+)
+
+
+# mllama: vision keys are `vision_model`; the other fields use the defaults.
+_register_composite_model(
+    model_type="mllama",
+    vision_model_keys=["vision_model"],
+)
+
+
+# paligemma: every field uses the registration defaults.
+_register_composite_model(
+    model_type="paligemma",
+)
+
+
+# qwen2_audio: the `vision_model_keys` slot holds `audio_tower` for this model.
+_register_composite_model(
+    model_type="qwen2_audio",
+    vision_model_keys=["audio_tower"],
+)
+
+
+# qwen2_5_omni_thinker: projector at `visual.merger`; `audio_tower` is listed with the vision keys.
+_register_composite_model(
+    model_type="qwen2_5_omni_thinker",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# qwen2_vl: the language keys depend on the transformers version check for 4.52.0
+# (`language_model` when it passes, `model` otherwise).
+_register_composite_model(
+    model_type="qwen2_vl",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"]
+    if is_transformers_version_greater_than("4.52.0")
+    else ["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# qwen2_5_vl: same keys and the same version-dependent language keys as qwen2_vl.
+_register_composite_model(
+    model_type="qwen2_5_vl",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"]
+    if is_transformers_version_greater_than("4.52.0")
+    else ["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# qwen3_vl: the vision keys additionally list `visual.pos_embed` and `visual.deepstack_merger_list`.
+_register_composite_model(
+    model_type="qwen3_vl",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.pos_embed", "visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# qwen3_vl_moe: same keys as qwen3_vl.
+_register_composite_model(
+    model_type="qwen3_vl_moe",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.pos_embed", "visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# qwen3_omni_moe_thinker: qwen3_vl vision keys plus `audio_tower`; language keys are `model` and `lm_head`.
+_register_composite_model(
+    model_type="qwen3_omni_moe_thinker",
+    projector_key="visual.merger",
+    vision_model_keys=[
+        "visual.pos_embed",
+        "visual.patch_embed",
+        "visual.blocks",
+        "visual.deepstack_merger_list",
+        "audio_tower",
+    ],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+# video_llava: every field uses the registration defaults.
+_register_composite_model(
+    model_type="video_llava",
+)
diff --git a/LlamaFactory/src/llamafactory/model/patcher.py b/LlamaFactory/src/llamafactory/model/patcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e8ace21f4c63f6b84754ddb2aba9df01416faf
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/model/patcher.py
@@ -0,0 +1,282 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from types import MethodType
+from typing import TYPE_CHECKING, Any
+
+import torch
+from peft import PeftModel
+from transformers import GenerationMixin, PreTrainedModel, PreTrainedTokenizerBase
+from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.modeling_utils import is_fsdp_enabled
+
+from ..extras import logging
+from ..extras.misc import infer_optim_dtype
+from ..extras.packages import is_transformers_version_greater_than
+from .model_utils.attention import configure_attn_implementation, print_attn_implementation
+from .model_utils.checkpointing import prepare_model_for_training
+from .model_utils.embedding import resize_embedding_layer
+from .model_utils.kv_cache import configure_kv_cache
+from .model_utils.longlora import configure_longlora
+from .model_utils.moe import add_z3_leaf_module, configure_moe
+from .model_utils.packing import configure_packing
+from .model_utils.quantization import configure_quantization
+from .model_utils.rope import configure_rope
+from .model_utils.valuehead import prepare_valuehead_model
+from .model_utils.visual import autocast_projector_dtype, configure_visual_model
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedTokenizer, ProcessorMixin
+    from trl import AutoModelForCausalLMWithValueHead
+
+    from ..hparams import ModelArguments
+
+# `modeling_qwen3_omni_moe` is imported only when the transformers version check for 4.57.0 passes;
+# `patch_qwen3_omni_moe_thinker_text_sparse_moe_block` references it under the same check.
+if is_transformers_version_greater_than("4.57.0"):
+    from transformers.models.qwen3_omni_moe import modeling_qwen3_omni_moe
+
+
+logger = logging.get_logger(__name__)
+
+
+def patch_qwen3_omni_moe_thinker_text_sparse_moe_block():
+    if is_transformers_version_greater_than("4.57.0") and not is_transformers_version_greater_than("4.58.0"):
+        from .model_utils.moe import Qwen3OmniMoeThinkerTextSparseMoeBlock
+
+        logger.warning_rank0(
+            "You are using transformers with 4.x version, the Qwen3OmniMoeThinkerTextSparseMoeBlock will have some issues about deepspeed zero2 and fsdp2 training, so that we patched this model to avoid it. Transformers v5.0.0rc0 has fixed the issue, you can also try to update the transformers to using qwen3_omni. See more information on https://github.com/hiyouga/LLaMA-Factory/issues/9628."
+        )
+
+        modeling_qwen3_omni_moe.Qwen3OmniMoeThinkerTextSparseMoeBlock = Qwen3OmniMoeThinkerTextSparseMoeBlock
+
+
+def patch_youtu_vl_model(model: "PreTrainedModel") -> None:
+    original_forward = model.forward
+
+    def forward(self, *args, **kwargs):
+        outputs = original_forward(*args, **kwargs)
+        if "loss" not in outputs and "labels" in kwargs:
+            logits = outputs.get("logits")
+            labels = kwargs.get("labels")
+            if logits is not None and labels is not None:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                loss_fct = torch.nn.CrossEntropyLoss()
+                loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
+                outputs["loss"] = loss
+
+        return outputs
+
+    model.forward = MethodType(forward, model)
+
+
+def patch_tokenizer(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> None:
+    if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__):
+        tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer)
+
+    if model_args.model_max_length is not None and tokenizer.model_max_length < model_args.model_max_length:
+        tokenizer.model_max_length = model_args.model_max_length  # enlarge the tokenizer max length
+
+    if model_args.add_tokens is not None:
+        num_added_tokens = tokenizer.add_tokens(new_tokens=model_args.add_tokens, special_tokens=False)
+        logger.info_rank0("Add tokens {} to tokenizer's vocabulary.".format(",".join(model_args.add_tokens)))
+        if num_added_tokens > 0 and not model_args.resize_vocab:
+            model_args.resize_vocab = True
+            logger.warning_rank0("New tokens have been added, changed `resize_vocab` to True.")
+
+    if model_args.add_special_tokens is not None:
+        num_added_special_tokens = tokenizer.add_tokens(new_tokens=model_args.add_special_tokens, special_tokens=True)
+        logger.info_rank0(
+            "Add special tokens {} to tokenizer's vocabulary.".format(",".join(model_args.add_special_tokens))
+        )
+        if num_added_special_tokens > 0 and not model_args.resize_vocab:
+            model_args.resize_vocab = True
+            logger.warning_rank0("New special tokens have been added, changed `resize_vocab` to True.")
+
+
+def patch_processor(
+    processor: "ProcessorMixin",
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+) -> None:
+    setattr(processor, "tokenizer", tokenizer)
+    setattr(processor, "image_max_pixels", model_args.image_max_pixels)
+    setattr(processor, "image_min_pixels", model_args.image_min_pixels)
+    setattr(processor, "image_do_pan_and_scan", model_args.image_do_pan_and_scan)
+    setattr(processor, "crop_to_patches", model_args.crop_to_patches)
+    setattr(processor, "video_max_pixels", model_args.video_max_pixels)
+    setattr(processor, "video_min_pixels", model_args.video_min_pixels)
+    setattr(processor, "video_fps", model_args.video_fps)
+    setattr(processor, "video_maxlen", model_args.video_maxlen)
+    setattr(processor, "use_audio_in_video", model_args.use_audio_in_video)
+    setattr(processor, "audio_sampling_rate", model_args.audio_sampling_rate)
+
+
+def patch_config(
+    config: "PretrainedConfig",
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    init_kwargs: dict[str, Any],
+    is_trainable: bool,
+) -> None:
+    if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
+        if model_args.infer_dtype != "auto" and not is_trainable:
+            model_args.compute_dtype = getattr(torch, model_args.infer_dtype)
+        else:
+            model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
+
+    configure_attn_implementation(config, model_args)
+    configure_rope(config, model_args)
+    configure_longlora(config, model_args, is_trainable)
+    configure_quantization(config, tokenizer, model_args, is_trainable, init_kwargs)
+    configure_moe(config, model_args, is_trainable)
+    configure_visual_model(config)
+    configure_packing(model_args, is_trainable)
+    configure_kv_cache(config, model_args, is_trainable)
+
+    if getattr(config, "model_type", None) == "qwen":
+        setattr(config, "use_flash_attn", model_args.flash_attn == "fa2")
+        for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
+            setattr(config, dtype_name, model_args.compute_dtype == dtype)
+
+    if getattr(config, "model_type", None) == "minicpmo":
+        setattr(config, "init_audio", True)
+        setattr(config, "init_tts", False)
+
+    # replace the top-k gating method
+    if getattr(config, "model_type", None) == "kimi_vl" and is_trainable:
+        setattr(config.text_config, "topk_method", "greedy")
+
+    architectures = getattr(config, "architectures", None)
+    if isinstance(architectures, list) and "InternVLChatModel" in architectures:
+        raise ValueError(
+            "Please download the internvl models in a Hugging Face–compatible format "
+            "(for example, https://huggingface.co/OpenGVLab/InternVL3-8B-hf)."
+        )
+
+    if isinstance(architectures, list) and "LlavaLlamaForCausalLM" in architectures:
+        raise ValueError("Please download llava models with hf-compatible format: https://huggingface.co/llava-hf")
+
+    if getattr(config, "model_type", None) == "internlm3" and not is_transformers_version_greater_than("4.47.1"):
+        raise RuntimeError("InternLM3 model requires transformers>=4.47.1, please upgrade it.")
+
+    if getattr(config, "model_type", None) == "lfm2_vl" and not is_transformers_version_greater_than("4.58.0"):
+        raise RuntimeError(
+            "LFM2.5-VL model requires transformers>=4.58.0 or install from commit: "
+            "pip install git+https://github.com/huggingface/transformers.git@3c2517727ce28a30f5044e01663ee204deb1cdbe"
+        )
+
+    if getattr(config, "model_type", None) == "qwen3_omni_moe":
+        patch_qwen3_omni_moe_thinker_text_sparse_moe_block()
+
+    # deepspeed zero3 is not compatible with low_cpu_mem_usage
+    init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage and (not is_deepspeed_zero3_enabled())
+
+    # fsdp/deepspeed zero3 does not need device map
+    if not (is_deepspeed_zero3_enabled() or is_fsdp_enabled()) and init_kwargs["low_cpu_mem_usage"]:
+        if "device_map" not in init_kwargs and model_args.device_map:
+            init_kwargs["device_map"] = model_args.device_map  # device map requires low_cpu_mem_usage=True
+
+        if init_kwargs.get("device_map", None) == "auto":
+            init_kwargs["offload_folder"] = model_args.offload_folder
+
+
+def patch_model(
+    model: "PreTrainedModel",
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    is_trainable: bool,
+    add_valuehead: bool,
+) -> None:
+    gen_config = model.generation_config  # check and fix generation config
+    if not gen_config.do_sample and (
+        (gen_config.temperature is not None and gen_config.temperature != 1.0)
+        or (gen_config.top_p is not None and gen_config.top_p != 1.0)
+        or (gen_config.typical_p is not None and gen_config.typical_p != 1.0)
+    ):
+        gen_config.do_sample = True
+
+    if getattr(model.config, "model_type", None) not in ["minicpmv", "minicpmo"] and "GenerationMixin" not in str(
+        model.generate.__func__
+    ):
+        model.generate = MethodType(GenerationMixin.generate, model)
+
+    if add_valuehead:
+        prepare_valuehead_model(model)
+
+    if model_args.resize_vocab:
+        resize_embedding_layer(
+            model,
+            tokenizer,
+            new_special_tokens_config=getattr(model_args, "_special_token_descriptions", None),
+            init_special_tokens=model_args.init_special_tokens,
+        )
+
+    if is_trainable:
+        if getattr(model.config, "model_type", None) == "gemma3n":
+            setattr(model_args, "disable_gradient_checkpointing", True)
+
+        if getattr(model.config, "model_type", None) == "youtu_vl":
+            patch_youtu_vl_model(model)
+
+        prepare_model_for_training(model, model_args)
+        autocast_projector_dtype(model, model_args)
+        add_z3_leaf_module(model)
+
+    if not model_args.use_unsloth:
+        print_attn_implementation(model.config)
+
+    try:
+        model.add_model_tags(["llama-factory"])
+    except Exception:
+        logger.warning_rank0("Cannot properly tag the model.")
+
+
+def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None:
+    def tie_weights(self: "AutoModelForCausalLMWithValueHead") -> None:
+        if isinstance(self.pretrained_model, PreTrainedModel):
+            self.pretrained_model.tie_weights()
+
+    def get_input_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module:
+        if isinstance(self.pretrained_model, PreTrainedModel):
+            return self.pretrained_model.get_input_embeddings()
+
+    def get_output_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module:
+        if isinstance(self.pretrained_model, PreTrainedModel):
+            return self.pretrained_model.get_output_embeddings()
+
+    def create_or_update_model_card(self: "AutoModelForCausalLMWithValueHead", output_dir: str) -> None:
+        if isinstance(self.pretrained_model, PeftModel):
+            self.pretrained_model.create_or_update_model_card(output_dir)
+
+    def get_rope_index_func(self: "AutoModelForCausalLMWithValueHead"):
+        if isinstance(self.pretrained_model, PeftModel):
+            base_model = self.pretrained_model.base_model.model
+        else:
+            base_model = self.pretrained_model
+
+        if base_model and hasattr(base_model, "get_rope_index"):
+            return base_model.get_rope_index
+        elif base_model and hasattr(base_model, "model") and hasattr(base_model.model, "get_rope_index"):
+            return base_model.model.get_rope_index
+        else:
+            return None
+
+    ignore_modules = [name for name, _ in model.named_parameters() if "pretrained_model" in name]
+    setattr(model, "_keys_to_ignore_on_save", ignore_modules)
+    setattr(model, "tie_weights", MethodType(tie_weights, model))
+    setattr(model, "get_input_embeddings", MethodType(get_input_embeddings, model))
+    setattr(model, "get_output_embeddings", MethodType(get_output_embeddings, model))
+    setattr(model, "get_rope_index", get_rope_index_func(model))
+    setattr(model, "create_or_update_model_card", MethodType(create_or_update_model_card, model))
diff --git a/LlamaFactory/src/llamafactory/third_party/__init__.py b/LlamaFactory/src/llamafactory/third_party/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/third_party/muon/__init__.py b/LlamaFactory/src/llamafactory/third_party/muon/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..afa615d0df2163ea271fff8e128aa8c81224dd8f
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/third_party/muon/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Re-export `Muon` from the sibling `muon` module as this package's public API.
+from .muon import Muon
+
+
+__all__ = ["Muon"]
diff --git a/LlamaFactory/src/llamafactory/third_party/muon/muon.py b/LlamaFactory/src/llamafactory/third_party/muon/muon.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7482c36b8196cf1dfaa8b0ae441b1cfd7ae5dc3
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/third_party/muon/muon.py
@@ -0,0 +1,226 @@
+# Copyright 2025 Moonshot AI and the LlamaFactory team.
+#
+# This code is based on the MoonshotAI's Moonlight library.
+# https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
+# and the Keller Jordan's Muon library.
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2025 Moonshot AI
+# Copyright (c) 2024 Keller Jordan
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import math
+
+import torch
+
+
+def zeropower_via_newtonschulz5(G: "torch.Tensor", steps: int) -> "torch.Tensor":
+    """Approximately orthogonalize a 2D tensor with a quintic Newton-Schulz iteration.
+
+    The coefficients of the quintic are chosen to maximize the slope at zero. Pushing the slope
+    beyond the point where the iteration still converges to one everywhere on the interval was
+    found to be empirically effective for keeping the step count low. As a consequence the result
+    is not UV^T but something like US'V^T, where USV^T = G is the SVD and S' is diagonal with
+    S_{ii}' ~ Uniform(0.5, 1.5); this does not hurt model performance relative to UV^T.
+
+    Args:
+        G: The tensor to orthogonalize. It must be 2D (asserted).
+        steps: The number of Newton-Schulz iterations to run.
+
+    Returns:
+        A bfloat16 tensor with the same shape as ``G``.
+    """
+    assert G.ndim == 2
+    coef_a, coef_b, coef_c = (3.4445, -4.7750, 2.0315)
+    is_tall = G.size(0) > G.size(1)
+    # Iterate on the wide orientation; the result is transposed back before returning.
+    mat = G.bfloat16().T if is_tall else G.bfloat16()
+    # Ensure spectral norm is at most 1
+    mat = mat / (mat.norm() + 1e-7)
+    # Perform the NS iterations
+    for _ in range(steps):
+        gram = mat @ mat.T
+        # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        poly = coef_b * gram + coef_c * gram @ gram
+        mat = coef_a * mat + poly @ mat
+
+    return mat.T if is_tall else mat
+
+
+class Muon(torch.optim.Optimizer):
+    """Muon - MomentUm Orthogonalized by Newton-schulz.
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Parameters passed via `adamw_params` are updated by an internal AdamW instead. Both update
+    rules read the same `lr` and `wd` from the parameter group.
+
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+
+    Arguments:
+        lr: The learning rate shared by Muon and the internal AdamW. For Muon parameters it is
+            additionally scaled by `0.2 * sqrt(max(rows, cols))`, see `adjust_lr_for_muon`.
+        wd: The decoupled weight decay; every updated parameter is multiplied by `1 - lr * wd`.
+        muon_params: An iterable of parameters to be optimized by Muon. Each must be 2D (asserted).
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
+        adamw_params: An iterable of parameters to be optimized by the internal AdamW.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+    """
+
+    def __init__(
+        self,
+        lr=1e-3,
+        wd=0.1,
+        muon_params=None,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_params=None,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+
+        # Materialize both iterables exactly once. They are walked twice (to build the parameter
+        # list and to tag each parameter), so a generator would otherwise be exhausted before the
+        # tagging loop and `step` would later fail on the missing `use_muon` flag.
+        muon_params = list(muon_params) if muon_params is not None else []
+        adamw_params = list(adamw_params) if adamw_params is not None else []
+        super().__init__(muon_params + adamw_params, defaults)
+        # Sort parameters into those for which we will use Muon, and those for which we will not
+        for p in muon_params:
+            # Muon only handles matrices; anything else must be passed via adamw_params
+            assert p.ndim == 2, p.ndim
+            self.state[p]["use_muon"] = True
+        for p in adamw_params:
+            # Do not use Muon for parameters in adamw_params
+            self.state[p]["use_muon"] = False
+
+    def adjust_lr_for_muon(self, lr: float, param_shape: list[int]) -> float:
+        """Scale the learning rate according to the size of the parameter matrix.
+
+        Args:
+            lr: The base learning rate.
+            param_shape: The shape of the parameter; only the first two dimensions are used.
+
+        Returns:
+            `lr * 0.2 * sqrt(max(A, B))` where `A, B` are the first two dimensions.
+        """
+        A, B = param_shape[:2]
+        # We adjust the learning rate based on the size of the parameter matrix
+        # as described in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+
+    def step(self, closure=None):
+        """Perform a single optimization step.
+
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+
+        Returns:
+            The value returned by `closure`, or None when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            # Muon loop
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+
+            # generate weight updates in distributed fashion
+            for p in params:
+                # parameters without a gradient are left untouched
+                g = p.grad
+                if g is None:
+                    continue
+                if g.ndim > 2:
+                    g = g.view(g.size(0), -1)
+
+                # calc update
+                state = self.state[p]
+                if "momentum_buffer" not in state:
+                    state["momentum_buffer"] = torch.zeros_like(g)
+                buf = state["momentum_buffer"]
+                buf.mul_(momentum).add_(g)
+                if group["nesterov"]:
+                    g = g.add(buf, alpha=momentum)
+                else:
+                    g = buf
+                u = zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+
+                # scale update
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+
+                # apply weight decay (uses the unadjusted learning rate)
+                p.data.mul_(1 - lr * wd)
+
+                # apply update
+                p.data.add_(u, alpha=-adjusted_lr)
+
+            # Adam backup
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            lr = group["lr"]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                # exponential moving averages of the gradient and of its square
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+
+                g = buf1 / (eps + buf2.sqrt())
+
+                # both bias corrections are folded into a single factor applied to the step size
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+
+        return loss
diff --git a/LlamaFactory/src/llamafactory/train/__init__.py b/LlamaFactory/src/llamafactory/train/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc858a3fa64a9dec1f2b095bbaa88ccf14445d08
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad7b22d6c0fe7616823958835eeed255b46456e2
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/__pycache__/fp8_utils.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/__pycache__/fp8_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62807a1e1141021c844f2dcd829e0ac6de2778d8
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/__pycache__/fp8_utils.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..992d47936a894ec5d3ad0de84319aaca622c9f10
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cdb3194e338a5c0372e5375e8236acc692adfd8
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/callbacks.py b/LlamaFactory/src/llamafactory/train/callbacks.py
new file mode 100644
index 0000000000000000000000000000000000000000..d164c04439965cea46b29d1fbd9356ba51ec9b2b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/callbacks.py
@@ -0,0 +1,384 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import signal
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import timedelta
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+import transformers
+from peft import PeftModel
+from transformers import PreTrainedModel, ProcessorMixin, TrainerCallback
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length
+from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
+from typing_extensions import override
+
+from ..extras import logging
+from ..extras.constants import TRAINER_LOG, V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
+from ..extras.misc import get_peak_memory, is_env_enabled, use_ray
+from ..extras.packages import is_safetensors_available
+
+
+if is_safetensors_available():
+    from safetensors import safe_open
+    from safetensors.torch import save_file
+
+
+if TYPE_CHECKING:
+    from transformers import TrainerControl, TrainerState, TrainingArguments
+    from trl import AutoModelForCausalLMWithValueHead
+
+    from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+def fix_valuehead_checkpoint(
+    model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool
+) -> None:
+    r"""Split a saved value-head checkpoint into decoder weights and value-head weights.
+
+    The model is expected to be unwrapped already. The state dict found on disk takes one of
+    three forms:
+    1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...}
+    2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...}
+    3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...}
+
+    We assume `stage3_gather_16bit_weights_on_model_save=true`.
+
+    Args:
+        model: The value-head model whose `pretrained_model` is re-saved into `output_dir`.
+        output_dir: The directory that holds the checkpoint file to rewrite.
+        safe_serialization: Whether the checkpoint is read and written in safetensors format.
+    """
+    backbone = model.pretrained_model
+    if not isinstance(backbone, (PreTrainedModel, PeftModel)):
+        return
+
+    weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
+    path_to_checkpoint = os.path.join(output_dir, weights_name)
+    if safe_serialization:
+        with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f:
+            # clone every tensor so that the data survives closing and deleting the file
+            state_dict: dict[str, torch.Tensor] = {key: f.get_tensor(key).clone() for key in f.keys()}
+    else:
+        state_dict: dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu", weights_only=True)
+
+    # the combined file is replaced by the two files written below
+    os.remove(path_to_checkpoint)
+    v_head_state_dict = {name: param for name, param in state_dict.items() if name.startswith("v_head.")}
+    decoder_state_dict = {
+        name.replace("pretrained_model.", "", 1): param
+        for name, param in state_dict.items()
+        if not name.startswith("v_head.")
+    }
+
+    # an empty decoder dict (lora tuning) falls back to the state dict of the model itself
+    backbone.save_pretrained(output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization)
+
+    if safe_serialization:
+        save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"})
+    else:
+        torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME))
+
+    logger.info_rank0(f"Value head model saved at: {output_dir}")
+
+
+class FixValueHeadModelCallback(TrainerCallback):
+    r"""A callback that rewrites every saved checkpoint of a valuehead model."""
+
+    @override
+    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        # only the process that is responsible for saving touches the checkpoint
+        if not args.should_save:
+            return
+
+        checkpoint_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
+        use_safetensors = getattr(args, "save_safetensors", True)
+        fix_valuehead_checkpoint(
+            model=kwargs.pop("model"), output_dir=checkpoint_dir, safe_serialization=use_safetensors
+        )
+
+
+class SaveProcessorCallback(TrainerCallback):
+    r"""A callback that stores the processor next to the saved model weights."""
+
+    def __init__(self, processor: "ProcessorMixin") -> None:
+        self.processor = processor
+
+    @override
+    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        # write the processor into the checkpoint folder of the current global step
+        if not args.should_save:
+            return
+
+        checkpoint_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
+        self.processor.save_pretrained(checkpoint_dir)
+
+    @override
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        # write the processor into the top-level output folder
+        if not args.should_save:
+            return
+
+        self.processor.save_pretrained(args.output_dir)
+
+
+class PissaConvertCallback(TrainerCallback):
+    r"""A callback that saves the initial PiSSA adapter and converts the trained one to a normal adapter."""
+
+    @override
+    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if not args.should_save:
+            return
+
+        model = kwargs.pop("model")
+        pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
+        logger.info_rank0(f"Initial PiSSA adapter will be saved at: {pissa_init_dir}.")
+        if not isinstance(model, PeftModel):
+            return
+
+        # save with init_lora_weights: True, then restore the configured value
+        init_lora_weights = model.peft_config["default"].init_lora_weights
+        model.peft_config["default"].init_lora_weights = True
+        model.save_pretrained(pissa_init_dir, safe_serialization=getattr(args, "save_safetensors", True))
+        model.peft_config["default"].init_lora_weights = init_lora_weights
+
+    @override
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if not args.should_save:
+            return
+
+        model = kwargs.pop("model")
+        pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
+        pissa_backup_dir = os.path.join(args.output_dir, "pissa_backup")
+        pissa_convert_dir = os.path.join(args.output_dir, "pissa_converted")
+        logger.info_rank0(f"Converted PiSSA adapter will be saved at: {pissa_convert_dir}.")
+        if not isinstance(model, PeftModel):
+            return
+
+        # The config is looked up again on every access because the adapter is reloaded below.
+        init_lora_weights = model.peft_config["default"].init_lora_weights
+
+        # 1. save a pissa backup with init_lora_weights: True
+        model.peft_config["default"].init_lora_weights = True
+        model.save_pretrained(pissa_backup_dir, safe_serialization=getattr(args, "save_safetensors", True))
+
+        # 2. save a converted lora with init_lora_weights: pissa
+        model.peft_config["default"].init_lora_weights = init_lora_weights
+        model.save_pretrained(
+            pissa_convert_dir,
+            safe_serialization=getattr(args, "save_safetensors", True),
+            path_initial_model_for_weight_conversion=pissa_init_dir,
+        )
+
+        # 3. load the pissa backup with init_lora_weights: True
+        model.load_adapter(pissa_backup_dir, "default", is_trainable=True)
+        model.set_adapter("default")
+
+        # 4. change init_lora_weights back to pissa
+        model.peft_config["default"].init_lora_weights = init_lora_weights
+
+
+class LogCallback(TrainerCallback):
+    r"""A callback for logging training and evaluation status.
+
+    Progress records are appended as JSON lines to the ``TRAINER_LOG`` file inside ``args.output_dir``
+    through a single-worker thread pool. When ``LLAMABOARD_ENABLED`` is set (and ray is not used), a
+    ``SIGABRT`` handler is installed that flags the run as aborted, and an extra logger handler is attached.
+    """
+
+    def __init__(self) -> None:
+        # Progress
+        self.start_time = 0
+        self.cur_steps = 0
+        self.max_steps = 0
+        self.elapsed_time = ""
+        self.remaining_time = ""
+        self.thread_pool: Optional[ThreadPoolExecutor] = None
+        # Status
+        self.aborted = False
+        self.do_train = False
+        # Web UI
+        self.webui_mode = is_env_enabled("LLAMABOARD_ENABLED")
+        if self.webui_mode and not use_ray():
+            signal.signal(signal.SIGABRT, self._set_abort)
+            self.logger_handler = logging.LoggerHandler(os.getenv("LLAMABOARD_WORKDIR"))
+            logging.add_handler(self.logger_handler)
+            transformers.logging.add_handler(self.logger_handler)
+
+    def _set_abort(self, signum, frame) -> None:
+        r"""Signal handler that marks the run as aborted; the step hooks act on the flag."""
+        self.aborted = True
+
+    def _reset(self, max_steps: int = 0) -> None:
+        r"""Restart the clock and clear the progress counters for a run of `max_steps` steps."""
+        self.start_time = time.time()
+        self.cur_steps = 0
+        self.max_steps = max_steps
+        self.elapsed_time = ""
+        self.remaining_time = ""
+
+    def _timing(self, cur_steps: int) -> None:
+        r"""Update the current step together with the elapsed and the estimated remaining time."""
+        cur_time = time.time()
+        elapsed_time = cur_time - self.start_time
+        # no estimate is possible before the first step has finished
+        avg_time_per_step = elapsed_time / cur_steps if cur_steps != 0 else 0
+        remaining_time = (self.max_steps - cur_steps) * avg_time_per_step
+        self.cur_steps = cur_steps
+        self.elapsed_time = str(timedelta(seconds=int(elapsed_time)))
+        self.remaining_time = str(timedelta(seconds=int(remaining_time)))
+
+    def _write_log(self, output_dir: str, logs: dict[str, Any]) -> None:
+        r"""Append one JSON line to the trainer log in `output_dir`."""
+        with open(os.path.join(output_dir, TRAINER_LOG), "a", encoding="utf-8") as f:
+            f.write(json.dumps(logs) + "\n")
+
+    def _create_thread_pool(self, output_dir: str) -> None:
+        r"""Create `output_dir` if needed and start the single-worker pool used for log writes."""
+        os.makedirs(output_dir, exist_ok=True)
+        self.thread_pool = ThreadPoolExecutor(max_workers=1)
+
+    def _close_thread_pool(self) -> None:
+        r"""Wait for pending log writes to finish and drop the pool."""
+        if self.thread_pool is not None:
+            self.thread_pool.shutdown(wait=True)
+            self.thread_pool = None
+
+    @override
+    def on_init_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        # remove a stale trainer log only when the user asked to overwrite the output folder
+        if (
+            args.should_save
+            and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG))
+            and args.overwrite_output_dir
+        ):
+            logger.warning_rank0_once("Previous trainer log in this folder will be deleted.")
+            os.remove(os.path.join(args.output_dir, TRAINER_LOG))
+
+    @override
+    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if args.should_save:
+            self.do_train = True
+            self._reset(max_steps=state.max_steps)
+            self._create_thread_pool(output_dir=args.output_dir)
+
+    @override
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        self._close_thread_pool()
+
+    @override
+    def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if self.aborted:
+            control.should_epoch_stop = True
+            control.should_training_stop = True
+
+    @override
+    def on_step_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if self.aborted:
+            control.should_epoch_stop = True
+            control.should_training_stop = True
+
+    @override
+    def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        # during training the pool is closed by on_train_end instead
+        if not self.do_train:
+            self._close_thread_pool()
+
+    @override
+    def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        # during training the pool is closed by on_train_end instead
+        if not self.do_train:
+            self._close_thread_pool()
+
+    @override
+    def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if not args.should_save:
+            return
+
+        self._timing(cur_steps=state.global_step)
+        # fall back to an empty record so that an empty history cannot raise IndexError
+        last_log = state.log_history[-1] if state.log_history else {}
+        logs = dict(
+            current_steps=self.cur_steps,
+            total_steps=self.max_steps,
+            loss=last_log.get("loss"),
+            eval_loss=last_log.get("eval_loss"),
+            predict_loss=last_log.get("predict_loss"),
+            reward=last_log.get("reward"),
+            accuracy=last_log.get("rewards/accuracies"),
+            lr=last_log.get("learning_rate"),
+            epoch=last_log.get("epoch"),
+            percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
+            elapsed_time=self.elapsed_time,
+            remaining_time=self.remaining_time,
+        )
+        if state.num_input_tokens_seen:
+            logs["throughput"] = round(state.num_input_tokens_seen / (time.time() - self.start_time), 2)
+            logs["total_tokens"] = state.num_input_tokens_seen
+
+        if is_env_enabled("RECORD_VRAM"):
+            vram_allocated, vram_reserved = get_peak_memory()
+            logs["vram_allocated"] = round(vram_allocated / (1024**3), 2)
+            logs["vram_reserved"] = round(vram_reserved / (1024**3), 2)
+
+        # drop the metrics that are absent from the latest history record
+        logs = {k: v for k, v in logs.items() if v is not None}
+        if self.webui_mode and all(key in logs for key in ("loss", "lr", "epoch")):
+            log_str = f"'loss': {logs['loss']:.4f}, 'learning_rate': {logs['lr']:2.4e}, 'epoch': {logs['epoch']:.2f}"
+            for extra_key in ("reward", "accuracy", "throughput"):
+                # None values were removed above, so membership is enough; a truthiness test
+                # would wrongly hide legitimate zero values such as an accuracy of 0.0
+                if extra_key in logs:
+                    log_str += f", '{extra_key}': {logs[extra_key]:.2f}"
+
+            logger.info_rank0("{" + log_str + "}")
+
+        if self.thread_pool is not None:
+            self.thread_pool.submit(self._write_log, args.output_dir, logs)
+
+    @override
+    def on_prediction_step(
+        self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs
+    ):
+        # progress of evaluation steps is only tracked for standalone evaluation / prediction
+        if self.do_train:
+            return
+
+        if self.aborted:
+            sys.exit(0)
+
+        if not args.should_save:
+            return
+
+        eval_dataloader = kwargs.pop("eval_dataloader", None)
+        if has_length(eval_dataloader):
+            # the first prediction step initializes the counters and the log writer
+            if self.max_steps == 0:
+                self._reset(max_steps=len(eval_dataloader))
+                self._create_thread_pool(output_dir=args.output_dir)
+
+            self._timing(cur_steps=self.cur_steps + 1)
+            # write a progress record every 5 steps
+            if self.cur_steps % 5 == 0 and self.thread_pool is not None:
+                logs = dict(
+                    current_steps=self.cur_steps,
+                    total_steps=self.max_steps,
+                    percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
+                    elapsed_time=self.elapsed_time,
+                    remaining_time=self.remaining_time,
+                )
+                self.thread_pool.submit(self._write_log, args.output_dir, logs)
+
+
+class ReporterCallback(TrainerCallback):
+    r"""A callback that forwards the run's hyperparameters to external loggers (wandb, swanlab)."""
+
+    def __init__(
+        self,
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+    ) -> None:
+        self.model_args = model_args
+        self.data_args = data_args
+        self.finetuning_args = finetuning_args
+        self.generating_args = generating_args
+        # keep a user-provided project name, otherwise fall back to "llamafactory"
+        os.environ.setdefault("WANDB_PROJECT", "llamafactory")
+
+    @override
+    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if not state.is_world_process_zero:
+            return
+
+        def collect_hparams() -> dict:
+            # built on demand so that nothing is serialized when no external logger is enabled
+            return {
+                "model_args": self.model_args.to_dict(),
+                "data_args": self.data_args.to_dict(),
+                "finetuning_args": self.finetuning_args.to_dict(),
+                "generating_args": self.generating_args.to_dict(),
+            }
+
+        if "wandb" in args.report_to:
+            import wandb
+
+            wandb.config.update(collect_hparams())
+
+        if self.finetuning_args.use_swanlab:
+            import swanlab  # type: ignore
+
+            swanlab.config.update(collect_hparams())
diff --git a/LlamaFactory/src/llamafactory/train/dpo/__init__.py b/LlamaFactory/src/llamafactory/train/dpo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c1a4a6bd8a6c68c6875f19fdf8eb9899e70826
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/dpo/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .workflow import run_dpo
+
+
+__all__ = ["run_dpo"]
diff --git a/LlamaFactory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4958de9532a3f5f15acb475fb8a6260187c3bfff
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20cbc393152da3c1a7ae70c9d0f47d6b92e34f63
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/dpo/ktrainer.py b/LlamaFactory/src/llamafactory/train/dpo/ktrainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0da2c6851fba900832793b7c76080dc2ad81254d
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/dpo/ktrainer.py
@@ -0,0 +1,62 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+import torch
+from ktransformers.sft.lora import KTrainer  # type: ignore
+from typing_extensions import override
+
+from ..trainer_utils import get_batch_logps, nested_detach
+from .trainer import CustomDPOTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+
+class KDPOTrainer(KTrainer, CustomDPOTrainer):
+    @override
+    def concatenated_forward(
+        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], is_ref_model: bool = False
+    ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""Run one forward pass over the concatenated batch and split the results in two halves.
+
+        The log probabilities are summed over the labels unless loss_type is IPO, ORPO or SimPO,
+        in which case they are averaged over the valid length.
+
+        Returns:
+            A tuple of (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps_avg),
+            where the last entry is the chosen log probabilities divided by the valid length.
+        """
+        if self.finetuning_args.use_ref_model:
+            batch = nested_detach(batch, clone=True)  # avoid error
+
+        # dpo do not need compute loss in forward, so the labels are withheld from the model
+        labels = batch.pop("labels")
+        outputs = model(**batch, return_dict=True, use_cache=False)
+        all_logits: torch.Tensor = outputs.logits.to(torch.float32)
+        # the logits are moved to the cpu and the labels follow them
+        all_logits = all_logits.to("cpu")
+        labels = labels.to(all_logits.device)
+        # the reference model is evaluated without ld_alpha
+        ld_alpha = None if is_ref_model else self.ld_alpha
+        all_logps, valid_length = get_batch_logps(logits=all_logits, labels=labels, ld_alpha=ld_alpha)
+        use_average = self.loss_type in ["ipo", "orpo", "simpo"]
+        if use_average:
+            all_logps = all_logps / valid_length
+
+        # the first half of the batch is treated as chosen, the second half as rejected
+        half = batch["input_ids"].size(0) // 2
+        chosen_logps, rejected_logps = all_logps.split(half, dim=0)
+        chosen_logits, rejected_logits = all_logits.split(half, dim=0)
+        chosen_length, _ = valid_length.split(half, dim=0)
+
+        # chosen_logps is already averaged in the first case
+        chosen_logps_avg = chosen_logps if use_average else chosen_logps / chosen_length
+        return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps_avg
diff --git a/LlamaFactory/src/llamafactory/train/dpo/trainer.py b/LlamaFactory/src/llamafactory/train/dpo/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7780e20ee73b6404c0a176e6ae08271f051fbacb
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/dpo/trainer.py
@@ -0,0 +1,342 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from collections import defaultdict
+from contextlib import nullcontext
+from types import MethodType
+from typing import TYPE_CHECKING, Literal, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import Trainer
+from trl import DPOTrainer
+from trl.trainer import disable_dropout_in_model
+from trl.trainer.utils import prepare_deepspeed
+from typing_extensions import override
+
+from ...extras.constants import IGNORE_INDEX
+from ...extras.packages import is_transformers_version_greater_than
+from ..callbacks import SaveProcessorCallback
+from ..trainer_utils import create_custom_optimizer, create_custom_scheduler, get_batch_logps, nested_detach
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, ProcessorMixin
+
+    from ...hparams import FinetuningArguments
+
+
+class CustomDPOTrainer(DPOTrainer):
+    def __init__(
+        self,
+        model: Union["PreTrainedModel", torch.nn.Module],
+        ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]],
+        finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
+        disable_dropout: bool = True,
+        **kwargs,
+    ):
+        r"""Set up the trainer by assigning the DPO attributes by hand and calling ``Trainer.__init__`` directly.
+
+        ``DPOTrainer.__init__`` is not invoked; the attributes it would normally provide are set below.
+
+        Args:
+            model: Model to train; its ``config.is_encoder_decoder`` is read here.
+            ref_model: Optional reference model. When given, it is prepared either through
+                ``prepare_deepspeed`` or through ``accelerator.prepare_model`` (and switched to eval mode).
+            finetuning_args: Source of the preference hyperparameters (``pref_beta``, ``pref_loss``,
+                ``pref_ftx``, ``pref_bco_weight``, ``dpo_label_smoothing``, ``simpo_gamma``, ``ld_alpha``)
+                and of the ``use_badam`` switch.
+            processor: When not None, a ``SaveProcessorCallback`` is registered for it.
+            disable_dropout: Whether to call ``disable_dropout_in_model`` on ``model`` and ``ref_model``.
+            **kwargs: Forwarded to ``Trainer.__init__``. When ``is_transformers_version_greater_than("4.46")``
+                is true, ``kwargs`` must contain ``tokenizer``, which is renamed to ``processing_class``.
+
+        Raises:
+            AttributeError: If the instance has no ``accelerator`` attribute after ``Trainer.__init__``.
+        """
+        if is_transformers_version_greater_than("4.46"):
+            kwargs["processing_class"] = kwargs.pop("tokenizer")
+
+        if disable_dropout:
+            disable_dropout_in_model(model)
+            if ref_model is not None:
+                disable_dropout_in_model(ref_model)
+
+        # Attributes set manually because DPOTrainer.__init__ is bypassed.
+        self.finetuning_args = finetuning_args
+        self.f_divergence_type = "reverse_kl"
+        self.reference_free = False
+        self.use_dpo_data_collator = True  # hack to avoid warning
+        self.generate_during_eval = False  # disable at evaluation
+        self.label_pad_token_id = IGNORE_INDEX
+        self.padding_value = 0
+        self.is_encoder_decoder = model.config.is_encoder_decoder
+        self.precompute_ref_log_probs = False
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+        self._peft_has_been_casted_to_bf16 = False
+
+        self.ref_model = ref_model
+        # Two-level store: split name -> metric name -> list of values (consumed in `log`).
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # dpo hyperparams
+        self.beta = finetuning_args.pref_beta
+        self.loss_type = finetuning_args.pref_loss
+        self.ftx_gamma = finetuning_args.pref_ftx
+        self.bco_gemma = finetuning_args.pref_bco_weight
+        self.label_smoothing = finetuning_args.dpo_label_smoothing
+        self.simpo_gamma = finetuning_args.simpo_gamma
+        self.ld_alpha = finetuning_args.ld_alpha
+
+        # Call the base Trainer initializer explicitly, skipping DPOTrainer.__init__.
+        Trainer.__init__(self, model=model, **kwargs)
+        self.model_accepts_loss_kwargs = False  # overwrite trainer's default behavior
+        if not hasattr(self, "accelerator"):
+            raise AttributeError("Please update `transformers`.")
+
+        # NOTE(review): this changes the process-wide warning filter, not only this trainer's warnings.
+        warnings.simplefilter("ignore")  # remove gc warnings on ref model
+
+        if ref_model is not None:
+            if self.is_deepspeed_enabled:
+                if not (
+                    getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False)
+                ):  # quantized models are already set on the correct device
+                    self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+            else:
+                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
+                self.ref_model.eval()
+
+        if processor is not None:
+            self.add_callback(SaveProcessorCallback(processor))
+
+        if finetuning_args.use_badam:
+            # Imported lazily so that `badam` is only required when the option is enabled.
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore
+
+            # Replace the accelerator's gradient clipping with badam's implementation, bound as a method.
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.add_callback(BAdamCallback)
+
+        # `self.running` is the baseline tracker read by `bco_loss`.
+        # NOTE(review): this check uses `>=` while `compute_preference_loss` uses `>` on the same threshold.
+        if self.bco_gemma >= 1e-6:
+            from trl.trainer import RunningMoments
+
+            self.running = RunningMoments(self.accelerator)
+
+    @override
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:
+            self.optimizer = create_custom_optimizer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    @override
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    @override
+    def _get_train_sampler(self, *args, **kwargs) -> Optional["torch.utils.data.Sampler"]:
+        if self.finetuning_args.disable_shuffling:
+            return torch.utils.data.SequentialSampler(self.train_dataset)
+
+        return super()._get_train_sampler(*args, **kwargs)
+
+    @override
+    def get_batch_samples(self, *args, **kwargs):
+        r"""Replace the method of DPO Trainer with the one of the standard Trainer."""
+        return Trainer.get_batch_samples(self, *args, **kwargs)
+
+    def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor":
+        r"""Compute ORPO's odds ratio (OR) loss for batched log probabilities of the policy model."""
+        log_odds = (chosen_logps - rejected_logps) - (
+            torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
+        )
+        sft_loss = -chosen_logps
+        odds_ratio_loss = -F.logsigmoid(log_odds)
+        orpo_loss = sft_loss + self.beta * odds_ratio_loss
+        return orpo_loss
+
+    def simpo_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor":
+        r"""Compute SimPO loss for batched log probabilities of the policy model."""
+        pi_logratios = chosen_logps - rejected_logps
+        gamma_logratios = self.simpo_gamma / self.beta
+        logits = pi_logratios - gamma_logratios
+        simpo_loss = -F.logsigmoid(self.beta * logits)
+        return simpo_loss
+
+    def bco_loss(
+        self,
+        chosen_logps: "torch.Tensor",
+        rejected_logps: "torch.Tensor",
+        reference_chosen_logps: "torch.Tensor",
+        reference_rejected_logps: "torch.Tensor",
+    ) -> "torch.Tensor":
+        chosen_logratios = chosen_logps - reference_chosen_logps
+        rejected_logratios = rejected_logps - reference_rejected_logps
+        chosen_rewards = self.beta * chosen_logratios
+        rejected_rewards = self.beta * rejected_logratios
+        rewards = torch.cat((chosen_rewards, rejected_rewards), 0).mean().detach()
+        self.running.update(rewards)  # update baseline
+        delta = self.running.mean
+        bco_loss = -F.logsigmoid((self.beta * chosen_logratios) - delta) - F.logsigmoid(
+            -(self.beta * rejected_logratios - delta)
+        )
+        return bco_loss
+
+    def compute_preference_loss(
+        self,
+        policy_chosen_logps: "torch.Tensor",
+        policy_rejected_logps: "torch.Tensor",
+        reference_chosen_logps: Optional["torch.Tensor"],
+        reference_rejected_logps: Optional["torch.Tensor"],
+    ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""Compute loss for preference learning."""
+        if not self.finetuning_args.use_ref_model:
+            if self.loss_type == "orpo":
+                losses = self.odds_ratio_loss(policy_chosen_logps, policy_rejected_logps)
+            elif self.loss_type == "simpo":
+                losses = self.simpo_loss(policy_chosen_logps, policy_rejected_logps)
+            else:
+                raise NotImplementedError(f"Unknown loss type: {self.loss_type}.")
+
+            chosen_rewards = self.beta * policy_chosen_logps.to(self.accelerator.device).detach()
+            rejected_rewards = self.beta * policy_rejected_logps.to(self.accelerator.device).detach()
+        else:
+            losses, chosen_rewards, rejected_rewards = self.dpo_loss(
+                policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps
+            )
+
+            if self.bco_gemma > 1e-6:
+                bco_losses = self.bco_loss(
+                    policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps
+                )
+                losses = (losses + bco_losses * self.bco_gemma) / (1.0 + self.bco_gemma)  # re-weight W_p and W_q
+
+        return losses, chosen_rewards, rejected_rewards
+
+    @override
+    def concatenated_forward(
+        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], is_ref_model: bool = False
+    ) -> dict[str, "torch.Tensor"]:
+        r"""Compute the sum log probabilities of the labels under given logits if loss_type is not IPO, ORPO or SimPO.
+
+        Otherwise the average log probabilities.
+        """
+        if self.finetuning_args.use_ref_model:
+            batch = nested_detach(batch, clone=True)  # avoid error
+
+        labels = batch.pop("labels")  # dpo do not need compute loss in forward
+        all_logits: torch.Tensor = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32)
+        all_logps, valid_length = get_batch_logps(
+            logits=all_logits, labels=labels, ld_alpha=(self.ld_alpha if not is_ref_model else None)
+        )
+        if self.loss_type in ["ipo", "orpo", "simpo"]:
+            all_logps = all_logps / valid_length
+
+        batch_size = batch["input_ids"].size(0) // 2
+        chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0)
+        chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0)
+        chosen_length, _ = valid_length.split(batch_size, dim=0)
+        if self.loss_type in ["ipo", "orpo", "simpo"]:
+            chosen_logps_avg = chosen_logps
+        else:
+            chosen_logps_avg = chosen_logps / chosen_length
+
+        return {
+            "chosen_logps": chosen_logps,
+            "rejected_logps": rejected_logps,
+            "chosen_logits": chosen_logits,
+            "rejected_logits": rejected_logits,
+            "chosen_logps_avg": chosen_logps_avg,
+        }
+
+    @override
+    def compute_reference_log_probs(
+        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"]
+    ) -> tuple[Optional["torch.Tensor"], Optional["torch.Tensor"]]:
+        r"""Compute log probabilities of the reference model."""
+        if not self.finetuning_args.use_ref_model:
+            return None, None
+
+        if self.ref_model is None:
+            ref_model = model
+            ref_context = self.accelerator.unwrap_model(model).disable_adapter()
+        else:
+            ref_model = self.ref_model
+            ref_context = nullcontext()
+
+        with torch.no_grad(), ref_context:
+            ref_output = self.concatenated_forward(ref_model, batch, is_ref_model=True)
+            reference_chosen_logps = ref_output["chosen_logps"]
+            reference_rejected_logps = ref_output["rejected_logps"]
+
+        return reference_chosen_logps, reference_rejected_logps
+
+    @override
+    def get_batch_loss_metrics(
+        self,
+        model: "PreTrainedModel",
+        batch: dict[str, "torch.Tensor"],
+        train_eval: Literal["train", "eval"] = "train",
+    ) -> tuple["torch.Tensor", dict[str, "torch.Tensor"]]:
+        r"""Compute the DPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+
+        model_output = self.concatenated_forward(model, batch)
+        policy_chosen_logps = model_output["chosen_logps"]
+        policy_rejected_logps = model_output["rejected_logps"]
+        policy_chosen_logits = model_output["chosen_logits"]
+        policy_rejected_logits = model_output["rejected_logits"]
+        policy_chosen_logps_avg = model_output["chosen_logps_avg"]
+
+        reference_chosen_logps, reference_rejected_logps = self.compute_reference_log_probs(model, batch)
+        losses, chosen_rewards, rejected_rewards = self.compute_preference_loss(
+            policy_chosen_logps,
+            policy_rejected_logps,
+            reference_chosen_logps,
+            reference_rejected_logps,
+        )
+        sft_loss = -policy_chosen_logps_avg
+        if self.ftx_gamma > 1e-6:
+            losses += self.ftx_gamma * sft_loss
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean().item()
+        metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean().item()
+        metrics[f"{prefix}rewards/accuracies"] = (chosen_rewards > rejected_rewards).float().mean().item()
+        metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean().item()
+        metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.mean().item()
+        metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.mean().item()
+        metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.mean().item()
+        metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.mean().item()
+        if self.loss_type == "orpo":
+            metrics[f"{prefix}sft_loss"] = sft_loss.mean().item()
+            metrics[f"{prefix}odds_ratio_loss"] = ((losses - sft_loss) / self.beta).mean().item()
+
+        return losses.mean(), metrics
+
+    @override
+    def compute_loss(
+        self, model: "PreTrainedModel", inputs: dict[str, "torch.Tensor"], return_outputs: bool = False, **kwargs
+    ) -> Union["torch.Tensor", tuple["torch.Tensor", list["torch.Tensor"]]]:
+        r"""Subclass and override to accept extra kwargs."""
+        return super().compute_loss(model, inputs, return_outputs)
+
+    @override
+    def log(self, logs: dict[str, float], *args, **kwargs) -> None:
+        r"""Log `logs` on the various objects watching training, including stored metrics."""
+        # logs either has "loss" or "eval_loss"
+        train_eval = "train" if "loss" in logs else "eval"
+        # Add averaged stored metrics to logs
+        key_list, metric_list = [], []
+        for key, metrics in self._stored_metrics[train_eval].items():
+            key_list.append(key)
+            metric_list.append(torch.tensor(metrics, dtype=torch.float).to(self.accelerator.device).mean().item())
+
+        del self._stored_metrics[train_eval]
+        if len(metric_list) < 10:  # pad to for all reduce
+            for i in range(10 - len(metric_list)):
+                key_list.append(f"dummy_{i}")
+                metric_list.append(0.0)
+
+        metric_list = torch.tensor(metric_list, dtype=torch.float).to(self.accelerator.device)
+        metric_list = self.accelerator.reduce(metric_list, "mean").tolist()
+        for key, metric in zip(key_list, metric_list):  # add remaining items
+            if not key.startswith("dummy_"):
+                logs[key] = metric
+
+        return Trainer.log(self, logs, *args, **kwargs)
diff --git a/LlamaFactory/src/llamafactory/train/dpo/workflow.py b/LlamaFactory/src/llamafactory/train/dpo/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..83ad38dfa6dfe0fe08ae78ad61e802af9ed495c7
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/dpo/workflow.py
@@ -0,0 +1,119 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/dpo.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Optional
+
+from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer
+from ...extras.constants import IGNORE_INDEX
+from ...extras.misc import calculate_tps
+from ...extras.ploting import plot_loss
+from ...hparams import ModelArguments
+from ...model import load_model, load_tokenizer
+from ..trainer_utils import create_modelcard_and_push, create_ref_model
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments
+
+
+def run_dpo(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    r"""Run the DPO workflow: load data and models, build the trainer, then train and/or evaluate.
+
+    Args:
+        model_args: Used to load the tokenizer, the model and (optionally) the reference model; its
+            ``use_kt`` flag selects ``KDPOTrainer`` instead of ``CustomDPOTrainer``.
+        data_args: Used for the template, the dataset and the label padding choice of the collator.
+        training_args: Controls ``do_train`` / ``do_eval``, the checkpoint to resume from and the output directory.
+        finetuning_args: Controls the reference model, loss plotting and the tokens-per-second metric.
+        callbacks: Optional trainer callbacks forwarded to the trainer.
+    """
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    # The dataset is requested with stage="rm" (the same stage value is passed to `calculate_tps` below).
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="rm", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+
+    data_collator = PairwiseDataCollatorWithPadding(
+        template=template,
+        model=model,
+        pad_to_multiple_of=8,
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
+        **tokenizer_module,
+    )
+
+    # Create reference model
+    if finetuning_args.use_ref_model:
+        if finetuning_args.ref_model is None and (not training_args.do_train):  # use the model itself
+            ref_model = model
+        else:
+            ref_model = create_ref_model(model_args, finetuning_args)
+    else:
+        ref_model = None
+
+    # Pick the trainer class; both branches bind it to the same local name `CustomDPOTrainer`.
+    if model_args.use_kt:
+        from ktransformers.util.globals import GLOBAL_CONFIG  # type: ignore
+
+        from .ktrainer import KDPOTrainer as CustomDPOTrainer
+
+        GLOBAL_CONFIG._config["mod"] = "sft"
+
+    else:
+        from .trainer import CustomDPOTrainer
+
+    # Initialize our Trainer
+    trainer = CustomDPOTrainer(
+        model=model,
+        ref_model=ref_model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **dataset_module,
+        **tokenizer_module,
+    )
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        if finetuning_args.include_effective_tokens_per_second:
+            train_result.metrics["effective_tokens_per_sec"] = calculate_tps(
+                dataset_module["train_dataset"], train_result.metrics, stage="rm"
+            )
+
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        # Only the main process plots; one eval-loss key is added per eval dataset when a dict is given.
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            keys = ["loss", "rewards/accuracies"]
+            if isinstance(dataset_module.get("eval_dataset"), dict):
+                keys += [f"eval_{key}_loss" for key in dataset_module["eval_dataset"].keys()]
+            else:
+                keys += ["eval_loss"]
+
+            plot_loss(training_args.output_dir, keys=keys)
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval")
+        if id(model) == id(ref_model):  # unable to compute rewards if reference model is the model itself
+            # Drop every metric whose name contains "rewards" before logging and saving.
+            remove_keys = [key for key in metrics.keys() if "rewards" in key]
+            for key in remove_keys:
+                metrics.pop(key)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/LlamaFactory/src/llamafactory/train/fp8_utils.py b/LlamaFactory/src/llamafactory/train/fp8_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..33728feadc3d9b1b4e22306b99f22130c0eff7c4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/fp8_utils.py
@@ -0,0 +1,229 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import types
+from typing import TYPE_CHECKING, Any, Optional
+
+from ..extras import logging
+
+
+if TYPE_CHECKING:
+    from ..hparams import TrainingArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+def create_fp8_kwargs(training_args: "TrainingArguments") -> list[Any]:
+    """Create the FP8 recipe kwargs for FP8 training with HuggingFace Accelerate.
+
+    Args:
+        training_args: Training arguments containing FP8 configuration
+
+    Returns:
+        A one-element list: ``FP8RecipeKwargs`` when ``fp8_backend`` is ``"te"``, otherwise ``AORecipeKwargs``.
+        An empty list is returned when FP8 is disabled or when any exception is raised while building
+        the recipe (the failure is only logged).
+    """
+    if not training_args.fp8:
+        return []
+
+    backend = getattr(training_args, "fp8_backend", "auto")
+    logger.info_rank0(f"Creating FP8 configuration with backend: {backend}")
+
+    # Imports are done lazily inside the try block so a missing optional package ends up in the except branch.
+    try:
+        # Use Transformer Engine backend (optimal for Hopper GPUs)
+        if backend == "te":
+            from accelerate.utils import FP8RecipeKwargs
+
+            logger.info_rank0("Using Transformer Engine FP8 backend")
+            return [FP8RecipeKwargs(backend="TE", fp8_format="HYBRID", amax_history_len=16, amax_compute_algo="max")]
+
+        # Use TorchAO backend (default)
+        from accelerate.utils import AORecipeKwargs
+
+        # Create Float8LinearConfig if torchao backend is used
+        # (for any other backend value `config` stays None and is passed on as such)
+        config = None
+        if backend == "torchao" or backend == "auto":
+            from torchao.float8 import Float8LinearConfig
+
+            # Use rowwise scaling for better performance (as recommended by torchao)
+            # Configure alignment requirements for FP8 kernels
+            config = Float8LinearConfig.from_recipe_name("rowwise")
+
+            # Enable alignment for better kernel performance
+            # (the attributes are only set when this config object already exposes them)
+            if hasattr(config, "enable_amax_init"):
+                config.enable_amax_init = True
+            if hasattr(config, "enable_pre_and_post_forward"):
+                config.enable_pre_and_post_forward = True
+
+        # Create module filter function to skip problematic layers
+        # TorchAO FP8 requires dimensions divisible by 16 for optimal kernels
+        def module_filter_func(module, layer_name):
+            # Returns True when the module should be converted, False when it should be left untouched.
+            # Skip embedding and output layers for numerical stability
+            skip_layers = ["embed", "lm_head", "output", "classifier"]
+            if any(skip_name in layer_name.lower() for skip_name in skip_layers):
+                return False
+
+            # Only convert Linear layers
+            # (detected here as "has a 2-D `weight`", not through an isinstance check)
+            if not (hasattr(module, "weight") and len(module.weight.shape) == 2):
+                return False
+
+            # Check dimension alignment for FP8 kernels
+            weight = module.weight
+            in_features, out_features = weight.shape[1], weight.shape[0]
+
+            # Skip layers with dimensions not divisible by 16 to avoid kernel errors
+            if in_features % 16 != 0 or out_features % 16 != 0:
+                logger.debug(
+                    f"Skipping layer {layer_name} with dimensions {out_features}x{in_features} (not divisible by 16)"
+                )
+                return False
+
+            return True
+
+        # Map FSDP all-gather setting if available (this affects the underlying implementation)
+        # NOTE: in this function the setting is only logged; it is not passed to the recipe kwargs.
+        if (
+            hasattr(training_args, "fp8_enable_fsdp_float8_all_gather")
+            and training_args.fp8_enable_fsdp_float8_all_gather
+        ):
+            logger.info_rank0("FSDP float8 all-gather optimization requested")
+
+        return [AORecipeKwargs(config=config, module_filter_func=module_filter_func)]
+    except Exception as e:
+        # Best effort: report the failure and fall back to "no FP8 kwargs".
+        logger.info_rank0(f"Failed to create FP8 configuration: {e}")
+        return []
+
+
+def get_fp8_mixed_precision(training_args: "TrainingArguments") -> Optional[str]:
+    """Get the mixed precision setting for Accelerate when using FP8.
+
+    Args:
+        training_args: Training arguments containing FP8 configuration
+
+    Returns:
+        "fp8" if FP8 is enabled, None otherwise
+    """
+    return "fp8" if training_args.fp8 else None
+
+
+def configure_fp8_environment(training_args: "TrainingArguments") -> None:
+    """Configure FP8 environment for HuggingFace Accelerate.
+
+    FP8 training is handled entirely through HuggingFace Accelerate, regardless of whether
+    DeepSpeed or FSDP is used for distributed training. This function sets up the environment
+    variables and validates the FP8 configuration.
+
+    Args:
+        training_args: Training arguments containing FP8 configuration
+    """
+    if not training_args.fp8:
+        return
+
+    # Set mixed precision to fp8 for HuggingFace Accelerate
+    os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8"
+    logger.info_rank0("Set ACCELERATE_MIXED_PRECISION=fp8")
+
+    # Configure FP8 backend and options
+    backend = getattr(training_args, "fp8_backend", "auto")
+    if backend != "auto":
+        os.environ["FP8_BACKEND"] = backend
+        logger.info_rank0(f"Set FP8_BACKEND={backend}")
+
+    # Create and validate FP8 recipe kwargs (for logging/debugging)
+    fp8_kwargs = create_fp8_kwargs(training_args)
+    logger.info_rank0(f"FP8 AORecipeKwargs created: {len(fp8_kwargs)} items")
+
+    # Enable FSDP float8 all-gather optimization if requested
+    if hasattr(training_args, "fp8_enable_fsdp_float8_all_gather") and training_args.fp8_enable_fsdp_float8_all_gather:
+        os.environ["FP8_ENABLE_FSDP_FLOAT8_ALL_GATHER"] = "true"
+        logger.info_rank0("Set FP8_ENABLE_FSDP_FLOAT8_ALL_GATHER=true")
+
+    logger.info_rank0("FP8 environment configured - all FP8 training handled by HuggingFace Accelerate")
+
+
+def verify_fp8_status(accelerator, training_args: "TrainingArguments") -> None:
+    """Verify that FP8 training is actually working after model preparation.
+
+    Args:
+        accelerator: The HuggingFace Accelerator instance
+        training_args: Training arguments containing FP8 configuration
+    """
+    if not training_args.fp8:
+        return
+
+    # Check Accelerate's FP8 status
+    fp8_enabled = getattr(accelerator, "fp8_enabled", False)
+    fp8_backend_type = getattr(accelerator, "fp8_backend", "UNKNOWN")
+
+    backend = getattr(training_args, "fp8_backend", "auto")
+    if backend == "torchao" or backend == "auto":
+        logger.info_rank0(
+            "FP8 training enabled with TorchAO backend. For optimal performance, "
+            "ensure model layer dimensions are mostly divisible by 16. "
+            "If you encounter issues, try fp8_backend='te' with Transformer Engine."
+        )
+    else:
+        logger.info_rank0(f"FP8 training enabled with {backend} backend.")
+
+    logger.info_rank0(f"Accelerate FP8 status - enabled: {fp8_enabled}, backend: {fp8_backend_type}")
+
+    if not fp8_enabled:
+        logger.info_rank0("WARNING: FP8 was requested but Accelerate shows fp8_enabled=False. FP8 may not be working.")
+
+
+def patch_accelerator_for_fp8() -> None:
+    """Patch Accelerator to inject FP8 recipe kwargs.
+
+    This is needed because HuggingFace Trainer doesn't pass kwargs_handlers to Accelerator.
+    We monkey-patch Accelerator.__init__ so that, whenever it is called without (or with empty)
+    ``kwargs_handlers``, a Transformer Engine FP8 recipe is injected and ``mixed_precision`` is set to
+    ``"fp8"``. Calls that already provide handlers are passed through unchanged.
+
+    The patch is applied at most once per process. ``transformer_engine`` and ``accelerate`` are imported
+    unconditionally, so an ImportError propagates when either is missing.
+    """
+    import transformer_engine.pytorch as te
+    from accelerate import Accelerator
+
+    # Guard against multiple patches
+    if getattr(Accelerator, "_te_fp8_patched", False):
+        return
+
+    # Stub for Accelerate 1.12+ compatibility (te.fp8.check_mxfp8_support doesn't exist yet)
+    # The stub always reports MXFP8 as unsupported.
+    if not hasattr(te, "fp8"):
+        te.fp8 = types.ModuleType("fp8")
+        te.fp8.check_mxfp8_support = lambda: (False, "MXFP8 not supported")
+
+    # Prefer `TERecipeKwargs` when this accelerate version provides it; otherwise fall back to
+    # `FP8RecipeKwargs`, which is constructed with an explicit `backend="TE"` below.
+    try:
+        from accelerate.utils import TERecipeKwargs as FP8Recipe
+
+        use_te_recipe = True
+    except ImportError:
+        from accelerate.utils import FP8RecipeKwargs as FP8Recipe
+
+        use_te_recipe = False
+
+    # Keep a reference to the unpatched initializer for delegation.
+    original_init = Accelerator.__init__
+
+    def patched_init(self, *args, **kwargs):
+        # Only inject when the caller passed no handlers (key absent, None or empty).
+        if "kwargs_handlers" not in kwargs or not kwargs["kwargs_handlers"]:
+            if use_te_recipe:
+                kwargs["kwargs_handlers"] = [
+                    FP8Recipe(fp8_format="HYBRID", amax_history_len=16, amax_compute_algo="max")
+                ]
+            else:
+                kwargs["kwargs_handlers"] = [
+                    FP8Recipe(backend="TE", fp8_format="HYBRID", amax_history_len=16, amax_compute_algo="max")
+                ]
+            # Only force mixed_precision when we inject handlers
+            kwargs["mixed_precision"] = "fp8"
+        return original_init(self, *args, **kwargs)
+
+    Accelerator.__init__ = patched_init
+    # Marker read by the guard above on subsequent calls.
+    Accelerator._te_fp8_patched = True
diff --git a/LlamaFactory/src/llamafactory/train/kto/__init__.py b/LlamaFactory/src/llamafactory/train/kto/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..491b067e41c53641f989d7dc17a22d6765f5684d
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/kto/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .workflow import run_kto
+
+
+__all__ = ["run_kto"]
diff --git a/LlamaFactory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d1f0946b7bf7823e0b1bd8aa68cc43b5b0a94f8
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d80386b7be5b271fd39ed8d382c82f8c617beae
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6dc46c8fb1e74f307444dceace347fb5a6bbf16f
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/kto/trainer.py b/LlamaFactory/src/llamafactory/train/kto/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..eea92f3be6b4112cbccc673ac32e1abc9f7ba27b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/kto/trainer.py
@@ -0,0 +1,305 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/kto_trainer.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from collections import defaultdict
+from contextlib import nullcontext
+from types import MethodType
+from typing import TYPE_CHECKING, Literal, Optional, Union
+
+import torch
+from transformers import Trainer
+from trl import KTOTrainer
+from trl.trainer import disable_dropout_in_model
+from trl.trainer.utils import prepare_deepspeed
+from typing_extensions import override
+
+from ...extras.constants import IGNORE_INDEX
+from ...extras.packages import is_transformers_version_greater_than
+from ..callbacks import SaveProcessorCallback
+from ..trainer_utils import create_custom_optimizer, create_custom_scheduler, get_batch_logps, nested_detach
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, ProcessorMixin
+
+    from ...hparams import FinetuningArguments
+
+
+class CustomKTOTrainer(KTOTrainer):  # trl's KTOTrainer driven by LlamaFactory's finetuning arguments
+    def __init__(
+        self,
+        model: Union["PreTrainedModel", torch.nn.Module],
+        ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]],
+        finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
+        disable_dropout: bool = True,
+        **kwargs,
+    ):
+        if is_transformers_version_greater_than("4.46"):  # hand the tokenizer over as `processing_class`
+            kwargs["processing_class"] = kwargs.pop("tokenizer")
+
+        if disable_dropout:
+            disable_dropout_in_model(model)
+            if ref_model is not None:
+                disable_dropout_in_model(ref_model)
+
+        self.finetuning_args = finetuning_args
+        self.reference_free = False
+        self.use_dpo_data_collator = True  # hack to avoid warning
+        self.generate_during_eval = False  # disable at evaluation
+        self.label_pad_token_id = IGNORE_INDEX
+        self.padding_value = 0
+        self.is_encoder_decoder = model.config.is_encoder_decoder
+        self.precompute_ref_log_probs = False
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+        self._peft_has_been_casted_to_bf16 = False
+
+        self.ref_model = ref_model
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # KTO hyperparameters, read from the finetuning arguments
+        self.beta = finetuning_args.pref_beta
+        self.desirable_weight = finetuning_args.kto_chosen_weight
+        self.undesirable_weight = finetuning_args.kto_rejected_weight
+        self.ftx_gamma = finetuning_args.pref_ftx
+        # trl
+        # Not all losses require a KL calculation
+        self.calculate_KL = True
+        if hasattr(self, "loss_type") and self.loss_type in ["apo_zero_unpaired"]:
+            self.calculate_KL = False
+        else:
+            self.loss_type = "kto"
+
+        Trainer.__init__(self, model=model, **kwargs)  # calls `Trainer.__init__` directly, not `super().__init__`
+        self.model_accepts_loss_kwargs = False  # overwrite trainer's default behavior
+        if not hasattr(self, "accelerator"):
+            raise AttributeError("Please update `transformers`.")
+
+        warnings.simplefilter("ignore")  # remove gc warnings on ref model
+
+        if ref_model is not None:
+            if self.is_deepspeed_enabled:
+                if not (
+                    getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False)
+                ):  # quantized models are already set on the correct device
+                    self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+            else:
+                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
+                self.ref_model.eval()
+
+        if processor is not None:
+            self.add_callback(SaveProcessorCallback(processor))
+
+        if finetuning_args.use_badam:
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.add_callback(BAdamCallback)
+
+    @override
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:  # try the custom optimizer factory first; `super()` is called either way
+            self.optimizer = create_custom_optimizer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    @override
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)  # return value is discarded
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    @override
+    def _get_train_sampler(self, *args, **kwargs) -> Optional["torch.utils.data.Sampler"]:
+        r"""Use a `SequentialSampler` when shuffling is disabled, otherwise the sampler of the base `Trainer`."""
+        if self.finetuning_args.disable_shuffling:
+            return torch.utils.data.SequentialSampler(self.train_dataset)
+
+        return Trainer._get_train_sampler(self, *args, **kwargs)
+
+    @override
+    def get_batch_samples(self, *args, **kwargs):
+        r"""Delegate to the standard `Trainer.get_batch_samples` instead of the KTO trainer's version."""
+        return Trainer.get_batch_samples(self, *args, **kwargs)
+
+    @override
+    def forward(
+        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], prefix: Literal["", "kl_"] = ""
+    ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""Run the forward pass; return logits, log-probs and log-probs divided by the valid length."""
+        batch = nested_detach(batch, clone=True)  # avoid error
+        model_inputs = {
+            "input_ids": batch[f"{prefix}input_ids"],
+            "attention_mask": batch[f"{prefix}attention_mask"],
+        }
+        if f"{prefix}token_type_ids" in batch:
+            model_inputs["token_type_ids"] = batch[f"{prefix}token_type_ids"]
+
+        if "pixel_values" in batch:  # pixel/image inputs are looked up without the prefix
+            model_inputs["pixel_values"] = batch["pixel_values"]
+
+        if "image_sizes" in batch:
+            model_inputs["image_sizes"] = batch["image_sizes"]
+
+        if "image_grid_thw" in batch:
+            model_inputs["image_grid_thw"] = batch["image_grid_thw"]
+
+        if "aspect_ratio_ids" in batch:
+            model_inputs["aspect_ratio_ids"] = batch["aspect_ratio_ids"]
+
+        if "aspect_ratio_mask" in batch:
+            model_inputs["aspect_ratio_mask"] = batch["aspect_ratio_mask"]
+
+        if f"{prefix}cross_attention_mask" in batch:
+            model_inputs["cross_attention_mask"] = batch[f"{prefix}cross_attention_mask"]
+
+        logits = model(**model_inputs, return_dict=True, use_cache=False).logits.to(torch.float32)
+        logps, valid_length = get_batch_logps(logits=logits, labels=batch[f"{prefix}labels"])
+        return logits, logps, logps / valid_length
+
+    @override
+    def concatenated_forward(
+        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"]
+    ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        target_logits, target_logps, target_logps_avg = self.forward(model, batch)
+        with torch.no_grad():  # the "kl_" pass is run without gradients
+            _, kl_logps, _ = self.forward(model, batch, prefix="kl_")
+
+        if len(target_logps) != len(batch["kto_tags"]):
+            raise ValueError("Mismatched shape of inputs and labels.")
+
+        chosen_logits = target_logits[batch["kto_tags"]]  # `kto_tags` marks the chosen examples
+        chosen_logps = target_logps[batch["kto_tags"]]
+        rejected_logits = target_logits[~batch["kto_tags"]]
+        rejected_logps = target_logps[~batch["kto_tags"]]
+        chosen_logps_avg = target_logps_avg[batch["kto_tags"]]
+        return chosen_logps, rejected_logps, chosen_logits, rejected_logits, kl_logps, chosen_logps_avg
+
+    @override
+    def compute_reference_log_probs(
+        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"]
+    ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""Compute log probabilities of the reference model."""
+        if self.ref_model is None:
+            ref_model = model  # no separate reference model: reuse the policy with its adapter disabled
+            ref_context = self.accelerator.unwrap_model(model).disable_adapter()
+        else:
+            ref_model = self.ref_model
+            ref_context = nullcontext()
+
+        with torch.no_grad(), ref_context:
+            reference_chosen_logps, reference_rejected_logps, _, _, reference_kl_logps, _ = self.concatenated_forward(
+                ref_model, batch
+            )
+
+        return reference_chosen_logps, reference_rejected_logps, reference_kl_logps
+
+    @override
+    def get_batch_loss_metrics(
+        self,
+        model: "PreTrainedModel",
+        batch: dict[str, "torch.Tensor"],
+    ) -> tuple["torch.Tensor", dict[str, "torch.Tensor"]]:
+        r"""Compute the KTO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_kl_logps,
+            policy_chosen_logps_avg,
+        ) = self.concatenated_forward(model, batch)
+        reference_chosen_logps, reference_rejected_logps, reference_kl_logps = self.compute_reference_log_probs(
+            model, batch
+        )
+        losses, chosen_rewards, rejected_rewards, kl = self.kto_loss(
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_kl_logps,
+            reference_chosen_logps,
+            reference_rejected_logps,
+            reference_kl_logps,
+        )
+        losses = losses.nanmean()  # mean over the batch, ignoring NaN entries
+
+        if self.ftx_gamma > 1e-6 and len(policy_chosen_logps) > 0:  # remember to rescale
+            sft_loss = -policy_chosen_logps_avg
+            losses += self.ftx_gamma * sft_loss.nanmean() / len(policy_chosen_logps) * len(batch["labels"])
+
+        num_chosen = len(chosen_rewards)
+        num_rejected = len(rejected_rewards)
+        if num_chosen > 0:  # sums and counts are stored here; `log` turns them into averages
+            metrics["rewards/chosen_sum"] = chosen_rewards.nansum().item()
+            metrics["logps/chosen_sum"] = policy_chosen_logps.nansum().item()
+            metrics["logits/chosen_sum"] = policy_chosen_logits.nansum().item()
+            metrics["count/chosen"] = float(num_chosen)
+
+        if num_rejected > 0:
+            metrics["rewards/rejected_sum"] = rejected_rewards.nansum().item()
+            metrics["logps/rejected_sum"] = policy_rejected_logps.nansum().item()
+            metrics["logits/rejected_sum"] = policy_rejected_logits.nansum().item()
+            metrics["count/rejected"] = float(num_rejected)
+
+        metrics["kl"] = kl.item()
+        return losses, metrics
+
+    @override
+    def compute_loss(
+        self, model: "PreTrainedModel", inputs: dict[str, "torch.Tensor"], return_outputs: bool = False, **kwargs
+    ) -> Union["torch.Tensor", tuple["torch.Tensor", list["torch.Tensor"]]]:
+        r"""Accept extra kwargs (they are not forwarded) and delegate to the parent `compute_loss`."""
+        return super().compute_loss(model, inputs, return_outputs)
+
+    @override
+    def log(self, logs: dict[str, float], *args, **kwargs) -> None:
+        r"""Log `logs` on the various objects watching training, including stored metrics."""
+        # logs either has "loss" or "eval_loss"
+        train_eval = "train" if "loss" in logs else "eval"
+        prefix = "eval_" if train_eval == "eval" else ""
+        # Add averaged stored metrics to logs
+        key_list, metric_list = [], []
+        for key, metrics in self._stored_metrics[train_eval].items():
+            key_list.append(key)
+            metric_list.append(torch.tensor(metrics, dtype=torch.float).to(self.accelerator.device).sum().item())
+
+        del self._stored_metrics[train_eval]
+        if len(metric_list) < 9:  # pad to a fixed length of 9 entries before the all-reduce
+            for i in range(9 - len(metric_list)):
+                key_list.append(f"dummy_{i}")
+                metric_list.append(0.0)
+
+        metric_list = torch.tensor(metric_list, dtype=torch.float).to(self.accelerator.device)
+        metric_list = self.accelerator.reduce(metric_list, "sum").tolist()
+        metric_dict: dict[str, float] = dict(zip(key_list, metric_list))
+        for split in ["chosen", "rejected"]:  # turn the reduced sums and counts into averages
+            if f"count/{split}" in metric_dict:
+                for key in ("rewards", "logps", "logits"):
+                    logs[f"{prefix}{key}/{split}"] = metric_dict[f"{key}/{split}_sum"] / metric_dict[f"count/{split}"]
+                    del metric_dict[f"{key}/{split}_sum"]
+                del metric_dict[f"count/{split}"]
+
+        if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs:  # calculate reward margin
+            logs[f"{prefix}rewards/margins"] = logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"]
+
+        for key, metric in metric_dict.items():  # add remaining items
+            if not key.startswith("dummy_"):
+                logs[key] = metric
+
+        return Trainer.log(self, logs, *args, **kwargs)
diff --git a/LlamaFactory/src/llamafactory/train/kto/workflow.py b/LlamaFactory/src/llamafactory/train/kto/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..df0794e3e986e0a0b55d667b255ee8c714fb8911
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/kto/workflow.py
@@ -0,0 +1,101 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/kto.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Optional
+
+from ...data import KTODataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer
+from ...extras.constants import IGNORE_INDEX
+from ...extras.ploting import plot_loss
+from ...hparams import ModelArguments
+from ...model import load_model, load_tokenizer
+from ..trainer_utils import create_modelcard_and_push, create_ref_model
+from .trainer import CustomKTOTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments
+
+
+def run_kto(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="kto", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+
+    data_collator = KTODataCollatorWithPadding(
+        template=template,
+        model=model,
+        pad_to_multiple_of=8,
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
+        **tokenizer_module,
+    )
+
+    # Create reference model
+    if finetuning_args.ref_model is None and (not training_args.do_train):  # use the model itself
+        ref_model = model
+    else:
+        ref_model = create_ref_model(model_args, finetuning_args)
+
+    # Initialize our Trainer
+    trainer = CustomKTOTrainer(
+        model=model,
+        ref_model=ref_model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **dataset_module,
+        **tokenizer_module,
+    )
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            keys = ["loss", "rewards/chosen"]
+            if isinstance(dataset_module.get("eval_dataset"), dict):
+                keys += [f"eval_{key}_loss" for key in dataset_module["eval_dataset"].keys()]
+            else:
+                keys += ["eval_loss"]
+
+            plot_loss(training_args.output_dir, keys=keys)
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval")
+        if id(model) == id(ref_model):  # unable to compute rewards without a reference model
+            remove_keys = [key for key in metrics.keys() if "rewards" in key]
+            for key in remove_keys:
+                metrics.pop(key)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/LlamaFactory/src/llamafactory/train/mca/__init__.py b/LlamaFactory/src/llamafactory/train/mca/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b3fb6eba7260ff5b6a01e19f9f05fc172a64df4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/mca/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .workflow import run_dpo, run_pt, run_sft
+
+
+__all__ = ["run_dpo", "run_pt", "run_sft"]
diff --git a/LlamaFactory/src/llamafactory/train/mca/trainer.py b/LlamaFactory/src/llamafactory/train/mca/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..97cc9b71379826d10914a14b67d34df3f4baffa8
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/mca/trainer.py
@@ -0,0 +1,15 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO override the original trainer
diff --git a/LlamaFactory/src/llamafactory/train/mca/workflow.py b/LlamaFactory/src/llamafactory/train/mca/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..affa2efc8f3408a4281235f064e504916b670989
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/mca/workflow.py
@@ -0,0 +1,291 @@
+# Copyright 2025 the ROLL team and the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+from collections.abc import Sequence
+from copy import deepcopy
+from typing import TYPE_CHECKING, Any, Optional
+
+from transformers import DataCollatorForSeq2Seq
+
+from ...data import (
+    SFTDataCollatorWith4DAttentionMask,
+    get_dataset,
+    get_template_and_fix_tokenizer,
+)
+from ...data.collator import (
+    PairwiseDataCollatorWithPadding,
+)
+from ...extras.constants import IGNORE_INDEX, MCA_SUPPORTED_MODELS
+from ...extras.logging import get_logger
+from ...extras.misc import calculate_tps
+from ...extras.packages import is_mcore_adapter_available
+from ...extras.ploting import plot_loss
+from ...model import load_tokenizer
+from ..callbacks import SaveProcessorCallback
+
+
+if not is_mcore_adapter_available():  # fail early: the imports right below need `mcore_adapter`
+    raise ImportError("mcore_adapter is not installed. Please install it with `pip install mcore-adapter`.")
+
+from mcore_adapter.models import AutoConfig, AutoModel
+from mcore_adapter.trainer import DPOTrainer as McaDPOTrainer
+from mcore_adapter.trainer import McaTrainer
+from mcore_adapter.trainer.dpo_config import DPOConfig
+
+
+if TYPE_CHECKING:
+    from mcore_adapter.training_args import Seq2SeqTrainingArguments as McaSeq2SeqTrainingArguments
+    from transformers import TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments, ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def _data_collator_wrapper(data_collator: Any):
+    @functools.wraps(data_collator)
+    def wrapper(features: Sequence[dict[str, Any]]):
+        labels_key = [k for k in features[0].keys() if k.endswith("labels")]
+        input_ids_key = [k for k in features[0].keys() if k.endswith("input_ids")]
+        for feature in features:
+            if len(labels_key) == 0:  # pt
+                feature["labels"] = deepcopy(feature["input_ids"])[1:]
+            for k in labels_key:
+                feature[k] = feature[k][1:]
+            for k in input_ids_key:
+                feature[k] = feature[k][:-1]
+            for k in ["attention_mask", "position_ids"]:
+                if k in feature:
+                    feature[k] = feature[k][:-1]
+        return data_collator(features)
+
+    return wrapper
+
+
+def _check_model_support(model_args: "ModelArguments"):
+    from transformers import AutoConfig as HfAutoConfig
+
+    config = HfAutoConfig.from_pretrained(
+        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
+    )
+    if config.model_type not in MCA_SUPPORTED_MODELS:
+        raise ValueError(f"Model {config.model_type} is not supported by MCA.")
+
+
+def run_pt(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "McaSeq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+
+    # dataset needs +1 then cut back due to MCA shift logic
+    data_args.cutoff_len += 1
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="pt", **tokenizer_module)
+    data_args.cutoff_len -= 1
+
+    _check_model_support(model_args)
+    model = AutoModel.from_pretrained(model_args.model_name_or_path, training_args)
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer=tokenizer,
+        pad_to_multiple_of=8,
+        label_pad_token_id=IGNORE_INDEX,
+    )
+    data_collator = _data_collator_wrapper(data_collator)
+
+    trainer = McaTrainer(
+        model=model,
+        args=training_args,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **dataset_module,
+    )
+
+    if "processor" in tokenizer_module and tokenizer_module["processor"] is not None:
+        trainer.add_callback(SaveProcessorCallback(tokenizer_module["processor"]))
+
+    if training_args.do_train:
+        train_result = trainer.train(training_args.resume_from_checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            keys = ["loss"]
+            if isinstance(dataset_module.get("eval_dataset"), dict):
+                keys += [f"eval_{key}_loss" for key in dataset_module["eval_dataset"].keys()]
+            else:
+                keys += ["eval_loss"]
+            plot_loss(training_args.output_dir, keys=keys)
+
+
+def run_sft(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "McaSeq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    # align packing flags
+    # TODO: FIX SequencePacking
+    data_args.neat_packing = training_args.sequence_packing = data_args.neat_packing or training_args.sequence_packing
+    data_args.packing = data_args.neat_packing or data_args.packing
+
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+
+    # dataset needs +1 then cut back due to MCA shift logic
+    data_args.cutoff_len += 1
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    data_args.cutoff_len -= 1
+
+    _check_model_support(model_args)
+    model = AutoModel.from_pretrained(model_args.model_name_or_path, training_args)
+
+    # optional freezing for qwen2_vl, qwen2_5_vl
+    if getattr(model.config, "hf_model_type", None) in ["qwen2_vl", "qwen2_5_vl", "qwen3_vl"]:
+        params_to_freeze = []
+        if finetuning_args.freeze_vision_tower:
+            params_to_freeze.extend(["vision_model.blocks", "vision_model.patch_embed"])
+
+        if finetuning_args.freeze_multi_modal_projector:
+            params_to_freeze.extend(["multi_modal_projector"])
+
+        if finetuning_args.freeze_language_model:
+            params_to_freeze.extend(["embedding", "decoder", "output_layer"])
+
+        if params_to_freeze:
+            for name, p in model.named_parameters():
+                if any(name.startswith(k) for k in params_to_freeze):
+                    p.requires_grad_(False)
+
+    pad_to_max = training_args.expert_model_parallel_size is not None and training_args.expert_model_parallel_size > 1
+    data_collator = SFTDataCollatorWith4DAttentionMask(
+        template=template,
+        padding="max_length" if pad_to_max else "longest",
+        max_length=data_args.cutoff_len if pad_to_max else None,
+        pad_to_multiple_of=64,
+        label_pad_token_id=IGNORE_INDEX,
+        **tokenizer_module,
+    )
+    data_collator = _data_collator_wrapper(data_collator)
+
+    trainer = McaTrainer(
+        model=model,
+        args=training_args,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **dataset_module,
+    )
+
+    if "processor" in tokenizer_module and tokenizer_module["processor"] is not None:
+        trainer.add_callback(SaveProcessorCallback(tokenizer_module["processor"]))
+
+    train_result = trainer.train(training_args.resume_from_checkpoint)
+    trainer.save_model()
+    trainer.log_metrics("train", train_result.metrics)
+    trainer.save_metrics("train", train_result.metrics)
+    trainer.save_state()
+    if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+        keys = ["loss"]
+        if isinstance(dataset_module.get("eval_dataset"), dict):
+            keys += [f"eval_{key}_loss" for key in dataset_module["eval_dataset"].keys()]
+        else:
+            keys += ["eval_loss"]
+        plot_loss(training_args.output_dir, keys=keys)
+
+
+def run_dpo(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "McaSeq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+
+    _check_model_support(model_args)
+    model = AutoModel.from_pretrained(model_args.model_name_or_path, training_args)
+
+    if finetuning_args.use_ref_model:
+        ref_config = AutoConfig.from_pretrained(model_args.model_name_or_path, training_args)
+        ref_model = AutoModel.from_config(ref_config)
+        ref_model.load_state_dict(model.state_dict())
+    else:
+        ref_model = None
+
+    # dataset needs +1 then cut back due to MCA shift logic
+    data_args.cutoff_len += 1
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="rm", **tokenizer_module)
+    data_args.cutoff_len -= 1
+
+    pad_to_max = training_args.expert_model_parallel_size is not None and training_args.expert_model_parallel_size > 1
+    dpo_config = DPOConfig(
+        beta=finetuning_args.pref_beta,
+        pref_loss=finetuning_args.pref_loss,
+        label_smoothing=finetuning_args.dpo_label_smoothing,
+    )
+    data_collator = PairwiseDataCollatorWithPadding(
+        template=template,
+        pad_to_multiple_of=64,
+        padding="max_length" if pad_to_max else "longest",
+        max_length=data_args.cutoff_len if pad_to_max else None,
+        label_pad_token_id=IGNORE_INDEX,
+        **tokenizer_module,
+    )
+    data_collator = _data_collator_wrapper(data_collator)
+
+    trainer = McaDPOTrainer(
+        model=model,
+        ref_model=ref_model,
+        args=training_args,
+        train_config=dpo_config,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **dataset_module,
+    )
+
+    if "processor" in tokenizer_module and tokenizer_module["processor"] is not None:
+        trainer.add_callback(SaveProcessorCallback(tokenizer_module["processor"]))
+
+    train_result = trainer.train(training_args.resume_from_checkpoint)
+    trainer.save_model()
+    if finetuning_args.include_effective_tokens_per_second:
+        train_result.metrics["effective_tokens_per_sec"] = calculate_tps(
+            dataset_module["train_dataset"], train_result.metrics, stage="rm"
+        )
+
+    trainer.log_metrics("train", train_result.metrics)
+    trainer.save_metrics("train", train_result.metrics)
+    trainer.save_state()
+    if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+        keys = ["loss", "rewards/accuracies"]
+        if isinstance(dataset_module.get("eval_dataset"), dict):
+            keys += [f"eval_{key}_loss" for key in dataset_module["eval_dataset"].keys()]
+        else:
+            keys += ["eval_loss"]
+
+        plot_loss(training_args.output_dir, keys=keys)
diff --git a/LlamaFactory/src/llamafactory/train/ppo/__init__.py b/LlamaFactory/src/llamafactory/train/ppo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed9bc4d274d2b0a5cc16074858cd552348620ceb
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/ppo/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .workflow import run_ppo
+
+
+__all__ = ["run_ppo"]
diff --git a/LlamaFactory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2bef9419fd8b4376385463c40028c698d29a6c7
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5e9f15a9986667ee2dc4a75a0fa1097d3abe53c
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..619390fcb4fe3ba583201d643d0f48f990f17158
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc43708688e7250531111861b18ba9ed0fc505cc
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/ppo/ppo_utils.py b/LlamaFactory/src/llamafactory/train/ppo/ppo_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d462e77b74e88d66af8e60f3483786c11607bea
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/ppo/ppo_utils.py
@@ -0,0 +1,80 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from contextlib import nullcontext
+from typing import TYPE_CHECKING, Literal, Optional
+
+import torch
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+from ...extras.packages import is_requests_available
+
+
+if is_requests_available():
+    import requests
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+    from trl import AutoModelForCausalLMWithValueHead
+
+
+def get_rewards_from_server(server_url: str, messages: list[str]) -> list["torch.Tensor"]:
+    r"""POST `messages` to the reward API server and return its "scores" field as a tensor; raises on HTTP errors."""
+    payload = {"model": "model", "messages": messages}
+    response = requests.post(server_url, json=payload, headers={"Content-Type": "application/json"})
+    response.raise_for_status()  # fail loudly on 4xx/5xx instead of an opaque KeyError / JSON decode error
+    rewards = json.loads(response.text)["scores"]
+    return torch.Tensor(rewards)
+
+
+def replace_model(model: "AutoModelForCausalLMWithValueHead", target: Literal["default", "reward"]) -> None:
+    r"""Switch the active adapter and value-head weights to `target`. The model is already unwrapped."""
+    summary = model.v_head.summary
+    gather_ctx = nullcontext()
+    if is_deepspeed_zero3_enabled():
+        import deepspeed  # type: ignore
+
+        # ZeRO-3 path: wrap the head's weight and bias in GatheredParameters (modifier_rank=0)
+        gather_ctx = deepspeed.zero.GatheredParameters([summary.weight, summary.bias], modifier_rank=0)
+
+    model.pretrained_model.set_adapter(target)  # make the requested LoRA adapter the active one
+    with gather_ctx:
+        if target == "reward":  # stash the current (default) head before it gets overwritten
+            model.default_head_weight = summary.weight.data.detach().clone()
+            model.default_head_bias = summary.bias.data.detach().clone()
+
+        head_device = summary.weight.device
+        for kind in ("weight", "bias"):  # weight first, then bias
+            new_value = model.get_buffer(f"{target}_head_{kind}").detach().clone().to(head_device)
+            getattr(summary, kind).data = new_value
+
+
+def dump_layernorm(model: "PreTrainedModel") -> dict[str, "torch.Tensor"]:
+    r"""Back up float32 params and downcast them in place. The model is already unwrapped (and gathered)."""
+    backup: dict[str, torch.Tensor] = {}
+    for param_name, weight in model.named_parameters():
+        if weight.data.dtype != torch.float32:
+            continue  # only float32 parameters are saved and cast to `model.config.torch_dtype`
+        backup[param_name] = weight.data.detach().clone()
+        weight.data = weight.data.to(model.config.torch_dtype)
+    return backup
+
+
+def restore_layernorm(model: "PreTrainedModel", layernorm_params: Optional[dict[str, "torch.Tensor"]] = None) -> None:
+    r"""Restore parameters from `dump_layernorm`; `None` is a no-op. The model is already unwrapped."""
+    for name, param in model.named_parameters():
+        if layernorm_params and name in layernorm_params:  # the default `None` must not hit `in`
+            param.data = layernorm_params[name]
diff --git a/LlamaFactory/src/llamafactory/train/ppo/trainer.py b/LlamaFactory/src/llamafactory/train/ppo/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaa74bb33966c9c3d50150d44006936512397a69
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/ppo/trainer.py
@@ -0,0 +1,518 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/ppo_trainer.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import os
+import sys
+import warnings
+from types import MethodType
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from accelerate.utils import DistributedDataParallelKwargs
+from tqdm import tqdm
+from transformers import GenerationConfig, Trainer, TrainerControl, TrainerState
+from transformers.optimization import get_scheduler
+from transformers.trainer import DEFAULT_CALLBACKS
+from transformers.trainer_callback import CallbackHandler
+from transformers.trainer_pt_utils import remove_dummy_checkpoint
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
+from trl import PPOConfig, PPOTrainer
+from trl import __version__ as trl_version
+from trl.models.utils import unwrap_model_for_generation
+from typing_extensions import override
+
+from ...extras import logging
+from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor, torch_gc
+from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback
+from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
+from .ppo_utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm
+
+
+if TYPE_CHECKING:
+    from datasets import Dataset
+    from transformers import (
+        DataCollatorWithPadding,
+        PreTrainedTokenizer,
+        ProcessorMixin,
+        Seq2SeqTrainingArguments,
+        TrainerCallback,
+    )
+    from trl import AutoModelForCausalLMWithValueHead
+
+    from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class CustomPPOTrainer(PPOTrainer, Trainer):
+    r"""Inherit PPOTrainer."""
+
+    def __init__(
+        self,
+        model_args: "ModelArguments",
+        training_args: "Seq2SeqTrainingArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+        callbacks: Optional[list["TrainerCallback"]],
+        model: "AutoModelForCausalLMWithValueHead",
+        reward_model: Optional["AutoModelForCausalLMWithValueHead"],  # forwarded as the server URL in "api" mode
+        ref_model: Optional["AutoModelForCausalLMWithValueHead"],
+        tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
+        data_collator: "DataCollatorWithPadding",
+        train_dataset: Optional["Dataset"] = None,
+        eval_dataset: Optional["Dataset"] = None,
+    ) -> None:
+        if eval_dataset is not None:
+            raise NotImplementedError("PPOTrainer does not support eval dataset yet.")
+
+        # Check if TRL version is compatible (0.8.6 <= version <= 0.9.6)
+        try:
+            from transformers.utils.versions import require_version
+
+            require_version(
+                "trl>=0.8.6,<=0.9.6",
+                "Incompatible TRL version detected. LLaMA-Factory ppo requires TRL version >=0.8.6,<=0.9.6. "
+                f"Found version {trl_version}. Please install the correct version with: `pip install trl>=0.8.6,<=0.9.6`\n"
+                "To fix: run `DISABLE_VERSION_CHECK=1 llamafactory-cli train example_ppo.yaml`\n",
+            )
+        except ImportError as e:
+            raise e
+        # per-device micro batch size x gradient accumulation steps
+        backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
+        ppo_config = PPOConfig(
+            model_name=model_args.model_name_or_path,
+            learning_rate=training_args.learning_rate,
+            mini_batch_size=training_args.per_device_train_batch_size,
+            batch_size=backward_batch_size * finetuning_args.ppo_buffer_size,  # rollouts gathered per ppo_train step
+            gradient_accumulation_steps=training_args.gradient_accumulation_steps,
+            ppo_epochs=finetuning_args.ppo_epochs,
+            max_grad_norm=training_args.max_grad_norm,
+            seed=training_args.seed,
+            optimize_device_cache=True,
+            target=finetuning_args.ppo_target,
+            use_score_scaling=finetuning_args.ppo_score_norm,
+            use_score_norm=finetuning_args.ppo_score_norm,  # the same flag drives both score options
+            whiten_rewards=finetuning_args.ppo_whiten_rewards,
+            accelerator_kwargs={"step_scheduler_with_optimizer": False},
+            log_with=training_args.report_to[0] if training_args.report_to else None,  # first reporter only
+            project_kwargs={"logging_dir": training_args.logging_dir},
+        )
+
+        # Add deepspeed config (an external logger is switched off in this mode)
+        if training_args.deepspeed_plugin is not None:
+            ppo_config.accelerator_kwargs["kwargs_handlers"] = [
+                DistributedDataParallelKwargs(find_unused_parameters=training_args.ddp_find_unused_parameters)
+            ]
+            ppo_config.accelerator_kwargs["deepspeed_plugin"] = training_args.deepspeed_plugin
+            if ppo_config.log_with is not None:
+                logger.warning_rank0("PPOTrainer cannot use external logger when DeepSpeed is enabled.")
+                ppo_config.log_with = None
+
+        # Create optimizer and scheduler (step count comes from max_steps or from the dataset size)
+        if training_args.max_steps > 0:
+            num_training_steps = training_args.max_steps
+        else:
+            total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size
+            num_training_steps = training_args.num_train_epochs * math.ceil(
+                len(train_dataset) / total_train_batch_size
+            )
+
+        optimizer = self.create_optimizer(model, training_args, finetuning_args)
+        scheduler = self.create_scheduler(training_args, num_training_steps, optimizer)
+        # initialise through PPOTrainer explicitly; Trainer-style attributes are assigned by hand further down
+        PPOTrainer.__init__(
+            self,
+            config=ppo_config,
+            model=model,
+            ref_model=ref_model,
+            tokenizer=tokenizer,
+            dataset=train_dataset,
+            optimizer=optimizer,
+            data_collator=data_collator,
+            lr_scheduler=scheduler,
+        )
+
+        self.args = training_args
+        self.model_args = model_args
+        self.finetuning_args = finetuning_args
+        self.reward_model = reward_model
+        self.current_device = get_current_device()  # patch for deepspeed training
+
+        self.generation_config = GenerationConfig(
+            pad_token_id=self.tokenizer.pad_token_id,
+            eos_token_id=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
+            **generating_args.to_dict(),
+        )
+        # Trainer-style bookkeeping (state, control, callbacks) that ppo_train relies on
+        self.state = TrainerState()
+        self.control = TrainerControl()
+        self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
+        self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
+        callbacks = DEFAULT_CALLBACKS if callbacks is None else DEFAULT_CALLBACKS + callbacks
+        self.callback_handler = CallbackHandler(
+            callbacks, self.accelerator.unwrap_model(self.model), self.tokenizer, self.optimizer, self.lr_scheduler
+        )
+        if self.args.max_steps > 0:
+            logger.info_rank0("max_steps is given, it will override any value given in num_train_epochs")
+
+        self.amp_context = torch.autocast(self.current_device.type)
+        warnings.simplefilter("ignore")  # remove gc warnings on ref model
+        # a "full" reward model is a separate module: wrap it for DeepSpeed or for the accelerator
+        if finetuning_args.reward_model_type == "full":
+            if self.is_deepspeed_enabled:
+                if not (
+                    getattr(reward_model.pretrained_model, "is_loaded_in_8bit", False)
+                    or getattr(reward_model.pretrained_model, "is_loaded_in_4bit", False)
+                ):  # quantized models are already set on the correct device
+                    self.reward_model = self._prepare_deepspeed(self.reward_model)
+            else:
+                self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
+
+        self.add_callback(FixValueHeadModelCallback)
+
+        if processor is not None:
+            self.add_callback(SaveProcessorCallback(processor))
+        # BAdam: swap in its gradient clipping function and register its callback
+        if finetuning_args.use_badam:
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.add_callback(BAdamCallback)
+
+    def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None:
+        r"""Implement training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer."""
+        if resume_from_checkpoint is not None:
+            raise ValueError("`resume_from_checkpoint` will be supported in the future version.")
+        # samples consumed per training step, multiplied across all processes
+        total_train_batch_size = (
+            self.args.per_device_train_batch_size
+            * self.args.gradient_accumulation_steps
+            * self.finetuning_args.ppo_buffer_size
+            * self.args.world_size
+        )
+        if self.args.max_steps > 0:
+            num_examples = total_train_batch_size * self.args.max_steps
+            num_train_epochs = sys.maxsize  # placeholder: the loop below is bounded by max_steps only
+            max_steps = self.args.max_steps
+            steps_in_epoch = self.args.max_steps
+        else:
+            len_dataloader = len(self.dataloader)
+            num_examples = len(self.dataset)
+            num_train_epochs = self.args.num_train_epochs
+            max_steps = math.ceil(num_train_epochs * len_dataloader)
+            steps_in_epoch = len_dataloader
+
+        self.state.max_steps = max_steps
+        self.state.num_train_epochs = num_train_epochs
+        self.state.is_local_process_zero = self.is_local_process_zero()
+        self.state.is_world_process_zero = self.is_world_process_zero()
+
+        logger.info_rank0("***** Running training *****")
+        logger.info_rank0(f"  Num examples = {num_examples:,}")
+        logger.info_rank0(f"  Num Epochs = {num_train_epochs:,}")
+        logger.info_rank0(f"  Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
+        logger.info_rank0(
+            f"  Total train batch size (w. parallel, buffer, distributed & accumulation) = {total_train_batch_size:,}"
+        )
+        logger.info_rank0(f"  Gradient Accumulation steps = {self.args.gradient_accumulation_steps:,}")
+        logger.info_rank0(f"  Num optimization epochs per batch = {self.finetuning_args.ppo_epochs:,}")
+        logger.info_rank0(f"  Total training steps = {max_steps:,}")
+        logger.info_rank0(f"  Number of trainable parameters = {count_parameters(self.model)[0]:,}")
+
+        dataiter = iter(self.dataloader)
+        loss_meter = AverageMeter()
+        reward_meter = AverageMeter()
+        self.callback_handler.on_train_begin(self.args, self.state, self.control)
+
+        for step in tqdm(range(max_steps), disable=not self.is_local_process_zero()):
+            try:
+                batch = next(dataiter)
+            except StopIteration:  # dataloader exhausted: start another pass over it
+                dataiter = iter(self.dataloader)
+                batch = next(dataiter)
+
+            # Get inputs: roll out responses and rewards one mini batch at a time (model in eval mode)
+            self.model.eval()
+            self.tokenizer.padding_side = "right"  # change padding side
+            queries, responses, rewards = [], [], []
+            for idx in range(0, self.config.batch_size, self.config.mini_batch_size):
+                mini_batch = {
+                    "input_ids": batch["input_ids"][idx : idx + self.config.mini_batch_size],
+                    "attention_mask": batch["attention_mask"][idx : idx + self.config.mini_batch_size],
+                }
+                mini_batch_queries, mini_batch_responses = self.get_inputs(mini_batch)
+                mini_batch_rewards = self.get_rewards(mini_batch_queries, mini_batch_responses)
+                queries.extend(mini_batch_queries)
+                responses.extend(mini_batch_responses)
+                rewards.extend(mini_batch_rewards)
+
+            # Run PPO step
+            self.model.train()
+            stats = self.step(queries, responses, rewards)
+            self.tokenizer.padding_side = "left"  # restore padding side
+            loss_meter.update(float(stats["ppo/loss/total"]), n=len(rewards))
+            reward_meter.update(torch.stack(rewards).mean().item(), n=len(rewards))
+            # forward the stats to the external logger when one is configured (best effort, failures only warn)
+            if self.config.log_with is not None:
+                try:
+                    batch["query"] = self.tokenizer.batch_decode(queries, skip_special_tokens=True)
+                    batch["response"] = self.tokenizer.batch_decode(responses, skip_special_tokens=True)
+                    self.log_stats(stats, batch, rewards)
+                except Exception:
+                    logger.warning_rank0("Failed to save stats due to unknown errors.")
+
+            self.state.global_step += 1
+            self.callback_handler.on_step_end(self.args, self.state, self.control)
+            # periodic report on the local main process: meter averages since the previous report
+            if self.is_local_process_zero() and (step + 1) % self.args.logging_steps == 0:
+                logs = dict(
+                    loss=round(loss_meter.avg, 4),
+                    reward=round(reward_meter.avg, 4),
+                    learning_rate=stats["ppo/learning_rate"],
+                    epoch=round(step / steps_in_epoch, 2),
+                )
+                tqdm.write(str(logs))
+                logs["step"] = step
+                self.state.log_history.append(logs)
+                self.callback_handler.on_log(self.args, self.state, self.control, logs)
+                loss_meter.reset()
+                reward_meter.reset()
+
+            if (step + 1) % self.args.save_steps == 0:  # save checkpoint
+                self.save_model(
+                    os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
+                )
+                self.callback_handler.on_save(self.args, self.state, self.control)
+
+            if self.control.should_epoch_stop or self.control.should_training_stop:
+                break
+
+        self.callback_handler.on_train_end(self.args, self.state, self.control)
+
+    @override
+    def create_optimizer(
+        self,
+        model: "AutoModelForCausalLMWithValueHead",
+        training_args: "Seq2SeqTrainingArguments",
+        finetuning_args: "FinetuningArguments",
+    ) -> "torch.optim.Optimizer":
+        r"""Use the optimizer from `create_custom_optimizer` if any, else build one with decay/no-decay groups."""
+        optimizer = create_custom_optimizer(model, training_args, finetuning_args)
+        if optimizer is not None:
+            return optimizer
+
+        # default path: split the trainable parameters by whether weight decay applies to them
+        decay_names = self.get_decay_parameter_names(model)
+        trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
+        with_decay = [p for n, p in trainable if n in decay_names]
+        without_decay = [p for n, p in trainable if n not in decay_names]
+
+        optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+        return optim_class(
+            [
+                dict(params=without_decay),
+                dict(params=with_decay, weight_decay=training_args.weight_decay),
+            ],
+            **optim_kwargs,
+        )
+
+    @override
+    def create_scheduler(
+        self, training_args: "Seq2SeqTrainingArguments", num_training_steps: int, optimizer: "torch.optim.Optimizer"
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        r"""Call `create_custom_scheduler`, then build the LR scheduler named by `lr_scheduler_type`."""
+        create_custom_scheduler(training_args, num_training_steps, optimizer)
+
+        scheduler_kind = training_args.lr_scheduler_type
+        warmup_steps = training_args.get_warmup_steps(num_training_steps)
+        return get_scheduler(
+            scheduler_kind, optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
+        )
+
+    @torch.no_grad()
+    def get_inputs(self, batch: dict[str, "torch.Tensor"]) -> tuple[list["torch.Tensor"], list["torch.Tensor"]]:
+        r"""Generate model's responses given queries; returns per-sample query and response tensors on CPU."""
+        if batch["input_ids"].size(0) == 1:  # handle llama2 ppo with gradient accumulation > 1
+            start_index = (batch["input_ids"][0] != self.tokenizer.pad_token_id).nonzero()[0].item()
+            for k, v in batch.items():
+                batch[k] = v[:, start_index:]  # drop the columns before the first non-pad token (mutates `batch`)
+        # NOTE: the `as` target below is immediately rebound to accelerator.unwrap_model(self.model)
+        with unwrap_model_for_generation(self.model, self.accelerator) as unwrapped_model:
+            unwrapped_model: AutoModelForCausalLMWithValueHead = self.accelerator.unwrap_model(self.model)
+            if self.model_args.upcast_layernorm:
+                layernorm_params = dump_layernorm(unwrapped_model)
+
+            generate_output: torch.Tensor = unwrapped_model.generate(
+                generation_config=self.generation_config, logits_processor=get_logits_processor(), **batch
+            )
+            if self.model_args.upcast_layernorm:
+                restore_layernorm(unwrapped_model, layernorm_params)
+        # everything past the prompt length is treated as the generated response
+        query = batch["input_ids"].detach().cpu()
+        response = generate_output[:, batch["input_ids"].size(-1) :].detach().cpu()
+        queries, responses = [], []
+        for i in range(len(query)):
+            query_start_index = (query[i] != self.tokenizer.pad_token_id).nonzero()[0].item()
+            response_indexes = (response[i] != self.tokenizer.pad_token_id).nonzero()
+
+            if len(response_indexes) == 0:  # allow empty response
+                response_length = 1
+            elif self.tokenizer.eos_token_id == self.tokenizer.pad_token_id:  # include eos token
+                response_length = response_indexes[-1].item() + 2
+            else:
+                response_length = response_indexes[-1].item() + 1
+
+            queries.append(query[i, query_start_index:])  # remove padding from left
+            responses.append(response[i, :response_length])  # remove padding from right
+
+        return queries, responses
+
+    @torch.no_grad()
+    def get_rewards(
+        self,
+        queries: list["torch.Tensor"],
+        responses: list["torch.Tensor"],
+    ) -> list["torch.Tensor"]:
+        r"""Score every query/response pair with the reward source selected by `reward_model_type`.
+
+        ppo_train passes CPU tensors from get_inputs. NOTE(review): results are not moved to CPU here - confirm.
+        """
+        if self.finetuning_args.reward_model_type == "api":
+            token_ids = [torch.cat((q, r), dim=-1).tolist() for q, r in zip(queries, responses)]
+            messages = self.tokenizer.batch_decode(token_ids, skip_special_tokens=False)
+            return get_rewards_from_server(self.reward_model, messages)  # `self.reward_model` is the server URL here
+
+        batch: dict[str, torch.Tensor] = self.prepare_model_inputs(queries, responses)
+        unwrapped_model: AutoModelForCausalLMWithValueHead = self.accelerator.unwrap_model(self.model)
+
+        if self.finetuning_args.reward_model_type in ["lora", "oft"]:
+            replace_model(unwrapped_model, target="reward")  # temporarily turn the policy into the reward model
+            reward_model = self.model
+        else:
+            reward_model = self.reward_model
+
+        with unwrap_model_for_generation(reward_model, self.accelerator), self.amp_context:  # support bf16
+            values: torch.Tensor = reward_model(**batch, return_dict=True, use_cache=False)[-1]
+
+        if self.finetuning_args.reward_model_type in ["lora", "oft"]:
+            replace_model(unwrapped_model, target="default")  # switch back to the default adapter and head
+        # reward of a row = its value at index (number of attended tokens - 1)
+        rewards = values.gather(dim=-1, index=(batch["attention_mask"].sum(dim=-1, keepdim=True) - 1))
+        return rewards.float().detach()  # use fp32 type
+
+    @override
+    def batched_forward_pass(
+        self,
+        model: "AutoModelForCausalLMWithValueHead",
+        queries: "torch.Tensor",
+        responses: "torch.Tensor",
+        model_inputs: dict[str, Any],
+        return_logits: bool = False,
+        response_masks: Optional["torch.Tensor"] = None,
+    ) -> tuple["torch.Tensor", Optional["torch.Tensor"], "torch.Tensor", "torch.Tensor"]:
+        r"""Run `model` over the inputs in chunks of `mini_batch_size` and collect the per-token outputs.
+
+        Returns (logprobs, logits or None, values, masks); masks are zeroed outside each sample's response span.
+        """
+        from trl.core import logprobs_from_logits
+
+        torch_gc()
+        bs = len(queries)
+        fbs = self.config.mini_batch_size
+        all_logprobs = []
+        all_logits = []
+        all_masks = []
+        all_values = []
+
+        for i in range(math.ceil(bs / fbs)):
+            input_kwargs = {key: value[i * fbs : (i + 1) * fbs] for key, value in model_inputs.items()}
+            query_batch = queries[i * fbs : (i + 1) * fbs]
+            response_batch = responses[i * fbs : (i + 1) * fbs]
+            if response_masks is not None:
+                response_masks_batch = response_masks[i * fbs : (i + 1) * fbs]
+            input_ids = input_kwargs["input_ids"]
+            attention_mask = input_kwargs["attention_mask"]
+
+            with self.amp_context:  # support bf16
+                logits, _, values = model(**input_kwargs, return_dict=True, use_cache=False)
+
+            logprobs = logprobs_from_logits(logits[:, :-1, :], input_ids[:, 1:])  # position t scores token t + 1
+            masks = torch.zeros_like(attention_mask)
+            masks[:, :-1] = attention_mask[:, 1:]
+
+            for j in range(len(query_batch)):
+                start = len(query_batch[j]) - 1
+                if attention_mask[j, 0] == 0:  # offset left padding
+                    start += attention_mask[j, :].nonzero()[0].item()
+                end = start + len(response_batch[j])
+
+                if response_masks is not None:  # keep the batch intact: build this sample's mask separately
+                    response_mask_j = torch.cat((torch.zeros_like(query_batch[j]), response_masks_batch[j]))[1:]
+
+                masks[j, :start] = 0
+                masks[j, end:] = 0
+                if response_masks is not None:
+                    masks[j, start:end] = masks[j, start:end] * response_mask_j[start:end]
+
+            if return_logits:
+                all_logits.append(logits)
+            else:
+                del logits
+
+            all_values.append(values)
+            all_logprobs.append(logprobs)
+            all_masks.append(masks)
+
+        return (
+            torch.cat(all_logprobs),
+            torch.cat(all_logits)[:, :-1] if return_logits else None,
+            torch.cat(all_values)[:, :-1],
+            torch.cat(all_masks)[:, :-1],
+        )
+
+    @override
+    def save_model(self, output_dir: Optional[str] = None) -> None:
+        r"""Save the model weights to `output_dir` (defaults to `self.args.output_dir`).
+
+        Under FSDP/DeepSpeed the state dict comes from the accelerator, with a checkpoint fallback on ValueError.
+        """
+        if output_dir is None:
+            output_dir = self.args.output_dir
+
+        if self.is_fsdp_enabled or self.is_deepspeed_enabled:
+            try:
+                state_dict = self.accelerator.get_state_dict(self.model)  # must be called at all ranks
+                if self.args.should_save:
+                    self._save(output_dir, state_dict=state_dict)
+            except ValueError:
+                logger.warning_rank0(
+                    " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead,"
+                    " use zero_to_fp32.py to recover weights"
+                )
+                if self.args.should_save:
+                    self._save(output_dir, state_dict={})  # written with an empty state dict, cleaned up just below
+                # remove the dummy state_dict
+                remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
+                self.model.save_checkpoint(output_dir)
+
+        elif self.args.should_save:
+            unwrapped_model: AutoModelForCausalLMWithValueHead = self.accelerator.unwrap_model(self.model)
+            self._save(output_dir, state_dict=unwrapped_model.state_dict())
diff --git a/LlamaFactory/src/llamafactory/train/ppo/workflow.py b/LlamaFactory/src/llamafactory/train/ppo/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..282a2f683672c2047dca4ce8362622e6e48c04aa
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/ppo/workflow.py
@@ -0,0 +1,79 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/ppo.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Optional
+
+from ...data import MultiModalDataCollatorForSeq2Seq, get_dataset, get_template_and_fix_tokenizer
+from ...extras.ploting import plot_loss
+from ...model import load_model, load_tokenizer
+from ..callbacks import fix_valuehead_checkpoint
+from ..trainer_utils import create_ref_model, create_reward_model
+from .trainer import CustomPPOTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+def run_ppo(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    generating_args: "GeneratingArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="ppo", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)
+
+    tokenizer.padding_side = "left"  # use left-padding in generation while using right-padding in training
+    data_collator = MultiModalDataCollatorForSeq2Seq(template=template, model=model, **tokenizer_module)
+
+    # Create reference model and reward model
+    ref_model = create_ref_model(model_args, finetuning_args, add_valuehead=True)
+    reward_model = create_reward_model(model, model_args, finetuning_args)
+
+    # Initialize our Trainer
+    ppo_trainer: CustomPPOTrainer = CustomPPOTrainer(
+        model_args=model_args,
+        training_args=training_args,
+        finetuning_args=finetuning_args,
+        generating_args=generating_args,
+        callbacks=callbacks,
+        model=model,
+        reward_model=reward_model,
+        ref_model=ref_model,
+        data_collator=data_collator,
+        **dataset_module,
+        **tokenizer_module,
+    )
+
+    # Training
+    if training_args.do_train:
+        ppo_trainer.ppo_train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        ppo_trainer.save_model()
+        if training_args.should_save:
+            fix_valuehead_checkpoint(model, training_args.output_dir, getattr(training_args, "save_safetensors", True))
+
+        ppo_trainer.save_state()  # must be called after save_model to have a folder
+        if ppo_trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            plot_loss(training_args.output_dir, keys=["loss", "reward"])
diff --git a/LlamaFactory/src/llamafactory/train/pt/__init__.py b/LlamaFactory/src/llamafactory/train/pt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f5c2898372d7dc2563472741fe76bba04de5479
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/pt/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .workflow import run_pt
+
+
+__all__ = ["run_pt"]
diff --git a/LlamaFactory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0a5ddf0d58ebe82b279f782d3b0ba7c40e0bf07
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ac8ffc68952d34cd49d769b79131e937fb94555
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e080d7b16364d55cc665129c051afef8fcf2ce77
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/pt/trainer.py b/LlamaFactory/src/llamafactory/train/pt/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a4bef3dd3db6800a8016ef5cc16014c61e5931c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/pt/trainer.py
@@ -0,0 +1,93 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from types import MethodType
+from typing import TYPE_CHECKING, Optional
+
+import torch
+from transformers import Trainer
+from typing_extensions import override
+
+from ..callbacks import SaveProcessorCallback
+from ..fp8_utils import configure_fp8_environment, patch_accelerator_for_fp8, verify_fp8_status
+from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
+
+
+if TYPE_CHECKING:
+    from transformers import ProcessorMixin
+
+    from ...hparams import FinetuningArguments, ModelArguments, TrainingArguments
+
+
+class CustomTrainer(Trainer):
+    r"""Inherit Trainer for custom optimizer."""
+
+    def __init__(
+        self,
+        finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
+        model_args: Optional["ModelArguments"] = None,
+        **kwargs,
+    ) -> None:
+        kwargs["processing_class"] = kwargs.pop("tokenizer")
+        # Configure FP8 environment if enabled
+        training_args: TrainingArguments = kwargs.get("args")
+        if training_args.fp8:
+            configure_fp8_environment(training_args)
+            if getattr(training_args, "fp8_backend", "auto") == "te":
+                patch_accelerator_for_fp8()
+
+        super().__init__(**kwargs)
+        if processor is not None:
+            # avoid wrong loss under gradient accumulation
+            # https://github.com/huggingface/transformers/pull/36044#issuecomment-2746657112
+            self.model_accepts_loss_kwargs = False
+
+        self.finetuning_args = finetuning_args
+
+        if processor is not None:
+            self.add_callback(SaveProcessorCallback(processor))
+
+        if finetuning_args.use_badam:
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.add_callback(BAdamCallback)
+
+        if training_args.fp8 and hasattr(self, "accelerator"):  # verify FP8 status after trainer initialization
+            verify_fp8_status(self.accelerator, training_args)
+
+    @override
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:
+            self.optimizer = create_custom_optimizer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    @override
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    @override
+    def _get_train_sampler(self, *args, **kwargs) -> Optional["torch.utils.data.Sampler"]:
+        if self.finetuning_args.disable_shuffling:
+            return torch.utils.data.SequentialSampler(self.train_dataset)
+
+        return super()._get_train_sampler(*args, **kwargs)
+
+    @override
+    def compute_loss(self, model, inputs, *args, **kwargs):
+        return super().compute_loss(model, inputs, *args, **kwargs)
diff --git a/LlamaFactory/src/llamafactory/train/pt/workflow.py b/LlamaFactory/src/llamafactory/train/pt/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ea604bf76dc05cade1810c230791cb654e7056
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/pt/workflow.py
@@ -0,0 +1,101 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import TYPE_CHECKING, Optional
+
+from transformers import DataCollatorForLanguageModeling
+
+from ...data import get_dataset, get_template_and_fix_tokenizer
+from ...extras.ploting import plot_loss
+from ...model import load_model, load_tokenizer
+from ..trainer_utils import create_modelcard_and_push
+from .trainer import CustomTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments, ModelArguments
+
+
+def run_pt(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="pt", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+    # Initialize our Trainer
+    trainer = CustomTrainer(
+        model=model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **dataset_module,
+        **tokenizer_module,
+    )
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            keys = ["loss"]
+            if isinstance(dataset_module.get("eval_dataset"), dict):
+                keys += [f"eval_{key}_loss" for key in dataset_module["eval_dataset"].keys()]
+            else:
+                keys += ["eval_loss"]
+
+            plot_loss(training_args.output_dir, keys=keys)
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval")
+
+        if isinstance(dataset_module.get("eval_dataset"), dict):
+            for key in dataset_module["eval_dataset"].keys():
+                try:
+                    perplexity = math.exp(metrics[f"eval_{key}_loss"])
+                except OverflowError:
+                    perplexity = float("inf")
+
+                metrics[f"eval_{key}_perplexity"] = perplexity
+        else:
+            try:
+                perplexity = math.exp(metrics["eval_loss"])
+            except OverflowError:
+                perplexity = float("inf")
+
+            metrics["eval_perplexity"] = perplexity
+
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/LlamaFactory/src/llamafactory/train/rm/__init__.py b/LlamaFactory/src/llamafactory/train/rm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0e8a45c0f6a4e426c459f0d3e353b8b5e3ebce7
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/rm/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .workflow import run_rm
+
+
+__all__ = ["run_rm"]
diff --git a/LlamaFactory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cf7cf0aaedd0316f9a2cdbf17e62038b2530562
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f7da582a916789c9e08cb8ea601af75faf3d0d8
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d9f157601923eaff2f4b1a4b11afa35edb8ced8
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34df3a0e2cd3f9284eb6cfaa8469dd6259e7e187
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/rm/metric.py b/LlamaFactory/src/llamafactory/train/rm/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae334cd9a27540ef07050161b22714947b7a4c8b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/rm/metric.py
@@ -0,0 +1,51 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from ...extras.misc import numpify
+
+
+if TYPE_CHECKING:
+    from transformers import EvalPrediction
+
+
+@dataclass
+class ComputeAccuracy:
+    r"""Compute reward accuracy and support `batch_eval_metrics`."""
+
+    def _dump(self) -> dict[str, float] | None:
+        result = None
+        if hasattr(self, "score_dict"):
+            result = {k: float(np.mean(v)) for k, v in self.score_dict.items()}
+
+        self.score_dict = {"accuracy": []}
+        return result
+
+    def __post_init__(self):
+        self._dump()
+
+    def __call__(self, eval_preds: "EvalPrediction", compute_result: bool = True) -> dict[str, float] | None:
+        chosen_scores, rejected_scores = numpify(eval_preds.predictions[0]), numpify(eval_preds.predictions[1])
+        if not chosen_scores.shape:
+            self.score_dict["accuracy"].append(chosen_scores > rejected_scores)
+        else:
+            for i in range(len(chosen_scores)):
+                self.score_dict["accuracy"].append(chosen_scores[i] > rejected_scores[i])
+
+        if compute_result:
+            return self._dump()
diff --git a/LlamaFactory/src/llamafactory/train/rm/trainer.py b/LlamaFactory/src/llamafactory/train/rm/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0384681b187748685a6f200a65b62d944e6fcf5
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/rm/trainer.py
@@ -0,0 +1,150 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from types import MethodType
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+from transformers import Trainer
+from typing_extensions import override
+
+from ...extras import logging
+from ...extras.packages import is_transformers_version_greater_than
+from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback
+from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, ProcessorMixin
+    from transformers.trainer import PredictionOutput
+
+    from ...hparams import FinetuningArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class PairwiseTrainer(Trainer):
+    r"""Inherits Trainer to compute pairwise loss."""
+
+    def __init__(
+        self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs
+    ) -> None:
+        if is_transformers_version_greater_than("4.46"):
+            kwargs["processing_class"] = kwargs.pop("tokenizer")
+
+        super().__init__(**kwargs)
+        self.model_accepts_loss_kwargs = False  # overwrite trainer's default behavior
+        self.finetuning_args = finetuning_args
+        self.can_return_loss = True  # override property to return eval_loss
+        self.add_callback(FixValueHeadModelCallback)
+
+        if processor is not None:
+            self.add_callback(SaveProcessorCallback(processor))
+
+        if finetuning_args.use_badam:
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.add_callback(BAdamCallback)
+
+    @override
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:
+            self.optimizer = create_custom_optimizer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    @override
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    @override
+    def _get_train_sampler(self, *args, **kwargs) -> Optional["torch.utils.data.Sampler"]:
+        if self.finetuning_args.disable_shuffling:
+            return torch.utils.data.SequentialSampler(self.train_dataset)
+
+        return super()._get_train_sampler(*args, **kwargs)
+
+    @override
+    def compute_loss(
+        self, model: "PreTrainedModel", inputs: dict[str, "torch.Tensor"], return_outputs: bool = False, **kwargs
+    ) -> Union["torch.Tensor", tuple["torch.Tensor", list["torch.Tensor"]]]:
+        r"""Compute pairwise loss. The first n examples are chosen and the last n examples are rejected.
+
+        Subclass and override to inject custom behavior.
+
+        Note that the first element will be removed from the output tuple.
+        See: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py#L3842
+        """
+        _, _, values = model(**inputs, output_hidden_states=True, return_dict=True, use_cache=False)
+        batch_size = inputs["input_ids"].size(0) // 2
+        chosen_masks, rejected_masks = torch.split(inputs["attention_mask"], batch_size, dim=0)
+        chosen_rewards, rejected_rewards = torch.split(values, batch_size, dim=0)
+        chosen_scores = chosen_rewards.gather(dim=-1, index=(chosen_masks.sum(dim=-1, keepdim=True) - 1))
+        rejected_scores = rejected_rewards.gather(dim=-1, index=(rejected_masks.sum(dim=-1, keepdim=True) - 1))
+        chosen_scores, rejected_scores = chosen_scores.squeeze(), rejected_scores.squeeze()
+
+        loss = -torch.nn.functional.logsigmoid(chosen_scores.float() - rejected_scores.float()).mean()
+        if return_outputs:
+            return loss, (loss, chosen_scores, rejected_scores)
+        else:
+            return loss
+
+    @override
+    def _save(self, output_dir: Optional[str] = None, state_dict=None):
+        if state_dict is None:
+            state_dict = self.model.state_dict()
+
+        if getattr(self.args, "save_safetensors", True):
+            from collections import defaultdict
+
+            ptrs = defaultdict(list)
+            for name, tensor in state_dict.items():
+                if isinstance(tensor, torch.Tensor):
+                    ptrs[id(tensor)].append(name)
+
+            for names in ptrs.values():
+                if len(names) > 1:
+                    names.sort()
+                    for name in names[1:]:
+                        state_dict.pop(name, None)
+
+        super()._save(output_dir, state_dict)
+
+    def save_predictions(self, predict_results: "PredictionOutput") -> None:
+        r"""Save model predictions to `output_dir`.
+
+        A custom behavior that not contained in Seq2SeqTrainer.
+        """
+        if not self.is_world_process_zero():
+            return
+
+        output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
+        logger.info_rank0(f"Saving prediction results to {output_prediction_file}")
+        chosen_scores, rejected_scores = predict_results.predictions
+
+        with open(output_prediction_file, "w", encoding="utf-8") as writer:
+            res: list[str] = []
+            for c_score, r_score in zip(chosen_scores, rejected_scores):
+                res.append(json.dumps({"chosen": round(float(c_score), 2), "rejected": round(float(r_score), 2)}))
+
+            writer.write("\n".join(res))
diff --git a/LlamaFactory/src/llamafactory/train/rm/workflow.py b/LlamaFactory/src/llamafactory/train/rm/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..326561c462bc490d845117a84f234dd194a12a72
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/rm/workflow.py
@@ -0,0 +1,98 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Optional
+
+from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer
+from ...extras.ploting import plot_loss
+from ...model import load_model, load_tokenizer
+from ..callbacks import fix_valuehead_checkpoint
+from ..trainer_utils import create_modelcard_and_push
+from .metric import ComputeAccuracy
+from .trainer import PairwiseTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments, ModelArguments
+
+
+def run_rm(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="rm", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)
+    data_collator = PairwiseDataCollatorWithPadding(
+        template=template, model=model, pad_to_multiple_of=8, **tokenizer_module
+    )
+
+    # Initialize our Trainer
+    trainer = PairwiseTrainer(
+        model=model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        compute_metrics=ComputeAccuracy(),
+        **dataset_module,
+        **tokenizer_module,
+    )
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        if training_args.should_save:
+            fix_valuehead_checkpoint(model, training_args.output_dir, getattr(training_args, "save_safetensors", True))
+
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            keys = ["loss"]
+            if isinstance(dataset_module.get("eval_dataset"), dict):
+                keys += sum(
+                    [[f"eval_{key}_loss", f"eval_{key}_accuracy"] for key in dataset_module["eval_dataset"].keys()], []
+                )
+            else:
+                keys += ["eval_loss", "eval_accuracy"]
+
+            plot_loss(training_args.output_dir, keys=keys)
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval")
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Predict
+    if training_args.do_predict:
+        predict_results = trainer.predict(dataset_module["eval_dataset"], metric_key_prefix="predict")
+        trainer.log_metrics("predict", predict_results.metrics)
+        trainer.save_metrics("predict", predict_results.metrics)
+        trainer.save_predictions(predict_results)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/LlamaFactory/src/llamafactory/train/sft/__init__.py b/LlamaFactory/src/llamafactory/train/sft/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6107a9ae741be83e0b3038015316f5ca7510fa76
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/sft/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .workflow import run_sft
+
+
+__all__ = ["run_sft"]
diff --git a/LlamaFactory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3456f49b5a9aabfdbabd662e77a48b5ce048bdd
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fba9c90e8138f58b935fdf764c360010e96c0d20
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc42759a5aafaa54b42e4ae527545c37caf6525b
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc b/LlamaFactory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11378179b773d3800627f39591fa2e2c2c8417c4
Binary files /dev/null and b/LlamaFactory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc differ
diff --git a/LlamaFactory/src/llamafactory/train/sft/metric.py b/LlamaFactory/src/llamafactory/train/sft/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..76ef1dec054916b8046e713478f84c44731076af
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/sft/metric.py
@@ -0,0 +1,134 @@
+# Copyright 2025 HuggingFace Inc., THUDM, and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library and the THUDM's ChatGLM implementation.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
+# https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+import torch
+from transformers.utils import is_nltk_available
+
+from ...extras.constants import IGNORE_INDEX
+from ...extras.misc import numpify
+from ...extras.packages import is_jieba_available, is_rouge_available
+
+
+if TYPE_CHECKING:
+    from transformers import EvalPrediction, PreTrainedTokenizer
+
+
+if is_jieba_available():
+    import jieba  # type: ignore
+
+
+if is_nltk_available():
+    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu  # type: ignore
+
+
+if is_rouge_available():
+    from rouge_chinese import Rouge  # type: ignore
+
+
+def eval_logit_processor(logits: "torch.Tensor", labels: "torch.Tensor") -> "torch.Tensor":
+    r"""Compute the token with the largest likelihood to reduce memory footprint."""
+    if isinstance(logits, (list, tuple)):
+        if logits[0].dim() == 3:  # (batch_size, seq_len, vocab_size)
+            logits = logits[0]
+        else:  # moe models have aux loss
+            logits = logits[1]
+
+    if logits.dim() != 3:
+        raise ValueError("Cannot process the logits.")
+
+    return torch.argmax(logits, dim=-1)
+
+
+@dataclass
+class ComputeAccuracy:
+    r"""Compute accuracy and support `batch_eval_metrics`."""
+
+    def _dump(self) -> Optional[dict[str, float]]:
+        result = None
+        if hasattr(self, "score_dict"):
+            result = {k: float(np.mean(v)) for k, v in self.score_dict.items()}
+
+        self.score_dict = {"accuracy": []}
+        return result
+
+    def __post_init__(self):
+        self._dump()
+
+    def __call__(self, eval_preds: "EvalPrediction", compute_result: bool = True) -> Optional[dict[str, float]]:
+        preds, labels = numpify(eval_preds.predictions), numpify(eval_preds.label_ids)
+        for i in range(len(preds)):
+            pred, label = preds[i, :-1], labels[i, 1:]
+            label_mask = label != IGNORE_INDEX
+            self.score_dict["accuracy"].append(np.mean(pred[label_mask] == label[label_mask]))
+
+        if compute_result:
+            return self._dump()
+
+
+@dataclass
+class ComputeSimilarity:
+    r"""Compute text similarity scores and support `batch_eval_metrics`.
+
+    Wraps the tokenizer into metric functions, used in CustomSeq2SeqTrainer.
+    """
+
+    tokenizer: "PreTrainedTokenizer"
+
+    def _dump(self) -> Optional[dict[str, float]]:
+        result = None
+        if hasattr(self, "score_dict"):
+            result = {k: float(np.mean(v)) for k, v in self.score_dict.items()}
+
+        self.score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []}
+        return result
+
+    def __post_init__(self):
+        self._dump()
+
+    def __call__(self, eval_preds: "EvalPrediction", compute_result: bool = True) -> Optional[dict[str, float]]:
+        preds, labels = numpify(eval_preds.predictions), numpify(eval_preds.label_ids)
+
+        preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id)
+        labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id)
+
+        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
+        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+        for pred, label in zip(decoded_preds, decoded_labels):
+            hypothesis = list(jieba.cut(pred))
+            reference = list(jieba.cut(label))
+
+            if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
+                result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
+            else:
+                rouge = Rouge()
+                scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
+                result = scores[0]
+
+            for k, v in result.items():
+                self.score_dict[k].append(round(v["f"] * 100, 4))
+
+            bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
+            self.score_dict["bleu-4"].append(round(bleu_score * 100, 4))
+
+        if compute_result:
+            return self._dump()
diff --git a/LlamaFactory/src/llamafactory/train/sft/trainer.py b/LlamaFactory/src/llamafactory/train/sft/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1bfe194f10520b86dedd63d9dba9bb4560402ce
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/sft/trainer.py
@@ -0,0 +1,184 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer_seq2seq.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from types import MethodType
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import numpy as np
+import torch
+from transformers import Seq2SeqTrainer
+from typing_extensions import override
+
+from ...extras import logging
+from ...extras.constants import IGNORE_INDEX
+from ..callbacks import SaveProcessorCallback
+from ..fp8_utils import configure_fp8_environment, patch_accelerator_for_fp8, verify_fp8_status
+from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
+
+
+if TYPE_CHECKING:
+    from torch.utils.data import Dataset
+    from transformers import ProcessorMixin
+    from transformers.trainer import PredictionOutput
+
+    from ...hparams import FinetuningArguments, ModelArguments, TrainingArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class CustomSeq2SeqTrainer(Seq2SeqTrainer):
+    r"""Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
+
+    On top of the base trainer this class plugs in the custom optimizer and scheduler
+    factories, optional FP8 and BAdam setup, optional DFT/EAFT loss functions, masking of the
+    prompt part of generated tokens during prediction, and dumping of predictions to a
+    JSON Lines file.
+    """
+
+    def __init__(
+        self,
+        finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
+        model_args: Optional["ModelArguments"] = None,
+        gen_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        r"""Build the trainer and enable the optional features selected by the arguments.
+
+        Args:
+            finetuning_args: Kept on `self.finetuning_args`; `use_badam`, `use_dft_loss`,
+                `use_eaft_loss` and `eaft_alpha` are read here.
+            processor: When not None, a `SaveProcessorCallback` is registered for it and
+                `model_accepts_loss_kwargs` is forced to False.
+            model_args: Accepted but not used in this method.
+            gen_kwargs: When not None, stored on `self._gen_kwargs`.
+            **kwargs: Forwarded to `Seq2SeqTrainer.__init__`. Must contain `tokenizer` (renamed
+                to `processing_class` before forwarding) and `args` (read for the FP8 flags).
+        """
+        # `pop` has no default here, so a missing `tokenizer` keyword raises KeyError
+        kwargs["processing_class"] = kwargs.pop("tokenizer")
+        # Configure FP8 environment if enabled
+        training_args: TrainingArguments = kwargs.get("args")
+        if training_args.fp8:
+            configure_fp8_environment(training_args)
+            if getattr(training_args, "fp8_backend", "auto") == "te":
+                patch_accelerator_for_fp8()
+
+        super().__init__(**kwargs)
+        if processor is not None:
+            # avoid wrong loss under gradient accumulation
+            # https://github.com/huggingface/transformers/pull/36044#issuecomment-2746657112
+            self.model_accepts_loss_kwargs = False
+
+        self.finetuning_args = finetuning_args
+        if gen_kwargs is not None:
+            # https://github.com/huggingface/transformers/blob/v4.45.0/src/transformers/trainer_seq2seq.py#L287
+            self._gen_kwargs = gen_kwargs
+
+        if processor is not None:
+            self.add_callback(SaveProcessorCallback(processor))
+
+        if finetuning_args.use_badam:
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore
+
+            # bind BAdam's gradient clipping function onto the accelerator as a method
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.add_callback(BAdamCallback)
+
+        # DFT takes precedence: the EAFT loss is only installed when `use_dft_loss` is off
+        if finetuning_args.use_dft_loss:
+            from ..trainer_utils import dft_loss_func
+
+            self.compute_loss_func = dft_loss_func
+
+        elif finetuning_args.use_eaft_loss:
+            from ..trainer_utils import eaft_loss_func
+
+            # the lambda pins `eaft_alpha` as the fourth argument of `eaft_loss_func`
+            self.compute_loss_func = lambda outputs, labels, num_items_in_batch=None: eaft_loss_func(
+                outputs, labels, num_items_in_batch, finetuning_args.eaft_alpha
+            )
+
+        if training_args.fp8 and hasattr(self, "accelerator"):  # verify FP8 status after trainer initialization
+            verify_fp8_status(self.accelerator, training_args)
+
+    @override
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        r"""Try the custom optimizer factory first, then defer to the base implementation.
+
+        NOTE(review): presumably `create_custom_optimizer` returns None when no custom optimizer
+        is configured, so the parent class builds its default one; confirm in `trainer_utils`.
+        """
+        if self.optimizer is None:
+            self.optimizer = create_custom_optimizer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    @override
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        r"""Call the custom scheduler hook, then return the scheduler built by the base class.
+
+        The return value of `create_custom_scheduler` is discarded here.
+        """
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    @override
+    def _get_train_sampler(self, *args, **kwargs) -> Optional["torch.utils.data.Sampler"]:
+        r"""Return a sequential sampler when `disable_shuffling` is set, else the base sampler."""
+        if self.finetuning_args.disable_shuffling:
+            return torch.utils.data.SequentialSampler(self.train_dataset)
+
+        return super()._get_train_sampler(*args, **kwargs)
+
+    @override
+    def compute_loss(self, model, inputs, *args, **kwargs):
+        r"""Forward every argument unchanged to the base `compute_loss`."""
+        return super().compute_loss(model, inputs, *args, **kwargs)
+
+    @override
+    def prediction_step(
+        self,
+        model: "torch.nn.Module",
+        inputs: dict[str, Union["torch.Tensor", Any]],
+        prediction_loss_only: bool,
+        ignore_keys: Optional[list[str]] = None,
+        **gen_kwargs,
+    ) -> tuple[Optional[float], Optional["torch.Tensor"], Optional["torch.Tensor"]]:
+        r"""Remove the prompt part in the generated tokens.
+
+        Subclass and override to inject custom behavior.
+
+        Returns:
+            A `(loss, generated_tokens, labels)` tuple. `labels` are the ones read from `inputs`
+            in this method, not the third value returned by the base `prediction_step`.
+        """
+        if self.args.predict_with_generate:  # do not pass labels to model when generate
+            labels = inputs.pop("labels", None)
+        else:
+            labels = inputs.get("labels")
+
+        loss, generated_tokens, _ = super().prediction_step(
+            model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys, **gen_kwargs
+        )
+        if generated_tokens is not None and self.args.predict_with_generate:
+            # overwrite the leading `input_ids` length of every row with the pad token id
+            generated_tokens[:, : inputs["input_ids"].size(-1)] = self.processing_class.pad_token_id
+            generated_tokens = generated_tokens.contiguous()
+
+        return loss, generated_tokens, labels
+
+    def save_predictions(
+        self, dataset: "Dataset", predict_results: "PredictionOutput", skip_special_tokens: bool = True
+    ) -> None:
+        r"""Save model predictions to `output_dir`.
+
+        Custom behavior that is not part of Seq2SeqTrainer. Only the world-zero process writes.
+        The output file `generated_predictions.jsonl` holds one JSON object per sample with the
+        keys `prompt`, `predict` and `label`.
+
+        Args:
+            dataset: Indexed with `"input_ids"` to decode the prompts (special tokens kept).
+            predict_results: Provides `predictions` and `label_ids`.
+            skip_special_tokens: Passed to `batch_decode` for predictions and labels.
+        """
+        if not self.is_world_process_zero():
+            return
+
+        output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
+        logger.info_rank0(f"Saving prediction results to {output_prediction_file}")
+
+        # ignored positions cannot be decoded, so they are replaced by the pad token id
+        labels = np.where(
+            predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.processing_class.pad_token_id
+        )
+        preds = np.where(
+            predict_results.predictions != IGNORE_INDEX,
+            predict_results.predictions,
+            self.processing_class.pad_token_id,
+        )
+
+        for i in range(len(preds)):
+            # despite its name, `pad_len` holds the indices of all non-pad tokens in the row;
+            # `pad_len[0]` is therefore the number of leading pad tokens
+            pad_len = np.nonzero(preds[i] != self.processing_class.pad_token_id)[0]
+            if len(pad_len):  # move pad token to last
+                preds[i] = np.concatenate((preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1)
+
+        decoded_inputs = self.processing_class.batch_decode(dataset["input_ids"], skip_special_tokens=False)
+        decoded_preds = self.processing_class.batch_decode(preds, skip_special_tokens=skip_special_tokens)
+        decoded_labels = self.processing_class.batch_decode(labels, skip_special_tokens=skip_special_tokens)
+
+        with open(output_prediction_file, "w", encoding="utf-8") as f:
+            for text, pred, label in zip(decoded_inputs, decoded_preds, decoded_labels):
+                f.write(json.dumps({"prompt": text, "predict": pred, "label": label}, ensure_ascii=False) + "\n")
diff --git a/LlamaFactory/src/llamafactory/train/sft/workflow.py b/LlamaFactory/src/llamafactory/train/sft/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf14f1eb83a272820c02a4e8046b3a32ad64992
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/sft/workflow.py
@@ -0,0 +1,173 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Optional
+
+from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer
+from ...extras.constants import IGNORE_INDEX
+from ...extras.logging import get_logger
+from ...extras.misc import calculate_tps
+from ...extras.packages import is_transformers_version_greater_than
+from ...extras.ploting import plot_loss
+from ...model import load_model, load_tokenizer
+from ..trainer_utils import create_modelcard_and_push
+from .metric import ComputeAccuracy, ComputeSimilarity, eval_logit_processor
+from .trainer import CustomSeq2SeqTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def run_sft(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    generating_args: "GeneratingArguments",
+    callbacks: Optional[list["TrainerCallback"]] = None,
+):
+    r"""Run the supervised fine-tuning workflow end to end.
+
+    Loads the tokenizer, template, dataset and model, builds the data collator, the metric
+    functions and the generation keyword arguments, creates the trainer, and then runs the
+    stages enabled on `training_args` (`do_train`, `do_eval`, `do_predict`). It finishes by
+    calling `create_modelcard_and_push`.
+
+    Args:
+        model_args: Used to load the tokenizer and model; `use_kt` selects the KTransformers
+            trainer instead of `CustomSeq2SeqTrainer`.
+        data_args: Used for the template, the dataset and the label padding choice.
+        training_args: Controls which stages run and whether metrics use generation.
+        finetuning_args: Fine-tuning switches such as `compute_accuracy` and `plot_loss`.
+        generating_args: Source of the keyword arguments handed to generation.
+        callbacks: Optional trainer callbacks passed through to the trainer.
+
+    Raises:
+        NotImplementedError: If `use_kt` is combined with `predict_with_generate` or
+            `compute_accuracy`.
+    """
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+
+    if getattr(model, "is_quantized", False) and not training_args.do_train:
+        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
+
+    data_collator = SFTDataCollatorWith4DAttentionMask(
+        template=template,
+        # the collator receives no model when metrics are computed from generated text
+        model=model if not training_args.predict_with_generate else None,
+        pad_to_multiple_of=8 if training_args.do_train else None,  # for shift short attention
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
+        block_diag_attn=model_args.block_diag_attn,
+        attn_implementation=getattr(model.config, "_attn_implementation", None),
+        compute_dtype=model_args.compute_dtype,
+        **tokenizer_module,
+    )
+
+    # Metric utils
+    metric_module = {}
+    if model_args.use_kt:
+        if training_args.predict_with_generate:
+            raise NotImplementedError("`predict_with_generate` is not supported in KTransformers SFT yet.")
+        elif finetuning_args.compute_accuracy:
+            raise NotImplementedError("`compute_accuracy` is not supported in KTransformers SFT yet.")
+
+    # generation-based similarity metrics take precedence over token accuracy
+    if training_args.predict_with_generate:
+        metric_module["compute_metrics"] = ComputeSimilarity(tokenizer=tokenizer)
+    elif finetuning_args.compute_accuracy:
+        metric_module["compute_metrics"] = ComputeAccuracy()
+        metric_module["preprocess_logits_for_metrics"] = eval_logit_processor
+
+    # Keyword arguments for `model.generate`
+    gen_kwargs = generating_args.to_dict(obey_generation_config=True)
+
+    # Compatible with Transformers v4 and Transformers v5
+    if is_transformers_version_greater_than("4.58.0"):
+        extra_ids = getattr(tokenizer, "additional_special_tokens_ids", None)
+        if not isinstance(extra_ids, list):
+            # fall back to converting the tokenizer's `_extra_special_tokens` into ids
+            extra_special_tokens = getattr(tokenizer, "_extra_special_tokens", [])
+            string_tokens = [str(t) for t in extra_special_tokens]
+            extra_ids = tokenizer.convert_tokens_to_ids(string_tokens)
+        all_eos_ids = [tokenizer.eos_token_id] + [i for i in extra_ids if i != -1]
+        # `dict.fromkeys` removes duplicate ids while keeping their first-seen order
+        unique_eos_ids = list(dict.fromkeys(all_eos_ids))
+        gen_kwargs["eos_token_id"] = unique_eos_ids
+    else:
+        gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
+    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
+
+    # Initialize our Trainer
+    if model_args.use_kt:
+        from ktransformers.sft.lora import KTrainer  # type: ignore
+        from ktransformers.util.globals import GLOBAL_CONFIG  # type: ignore
+
+        GLOBAL_CONFIG._config["mod"] = "sft"
+
+        # NOTE(review): `tokenizer` receives the whole `tokenizer_module` mapping here, not the
+        # tokenizer object itself; confirm this is what KTrainer expects.
+        # `gen_kwargs` is not passed to this trainer's constructor.
+        trainer = KTrainer(
+            model=model,
+            args=training_args,
+            tokenizer=tokenizer_module,
+            data_collator=data_collator,
+            callbacks=callbacks,
+            **dataset_module,
+            **metric_module,
+        )
+        trainer.model_accepts_loss_kwargs = False
+        model.config.use_cache = False
+
+    else:
+        trainer = CustomSeq2SeqTrainer(
+            model=model,
+            args=training_args,
+            finetuning_args=finetuning_args,
+            data_collator=data_collator,
+            callbacks=callbacks,
+            gen_kwargs=gen_kwargs,
+            **dataset_module,
+            **tokenizer_module,
+            **metric_module,
+        )
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        if finetuning_args.include_effective_tokens_per_second:
+            train_result.metrics["effective_tokens_per_sec"] = calculate_tps(
+                dataset_module["train_dataset"], train_result.metrics, stage="sft"
+            )
+
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            keys = ["loss"]
+            if isinstance(dataset_module.get("eval_dataset"), dict):
+                # one loss/accuracy key pair per named evaluation dataset
+                keys += sum(
+                    [[f"eval_{key}_loss", f"eval_{key}_accuracy"] for key in dataset_module["eval_dataset"].keys()], []
+                )
+            else:
+                keys += ["eval_loss", "eval_accuracy"]
+
+            plot_loss(training_args.output_dir, keys=keys)
+
+    # the padding side is switched only after training has finished
+    if training_args.predict_with_generate:
+        tokenizer.padding_side = "left"  # use left-padding in generation
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Predict
+    if training_args.do_predict:
+        logger.warning_rank0_once("Batch generation can be very slow. Consider using `scripts/vllm_infer.py` instead.")
+        # prediction runs on the `eval_dataset` entry of the dataset module
+        predict_results = trainer.predict(dataset_module["eval_dataset"], metric_key_prefix="predict", **gen_kwargs)
+        trainer.log_metrics("predict", predict_results.metrics)
+        trainer.save_metrics("predict", predict_results.metrics)
+        trainer.save_predictions(dataset_module["eval_dataset"], predict_results, generating_args.skip_special_tokens)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/LlamaFactory/src/llamafactory/train/test_utils.py b/LlamaFactory/src/llamafactory/train/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f31b3d2fc8b7b27fc60ec82a472d9da74ad519c4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/test_utils.py
@@ -0,0 +1,115 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM
+from trl import AutoModelForCausalLMWithValueHead
+
+from ..data import get_dataset, get_template_and_fix_tokenizer
+from ..hparams import get_infer_args, get_train_args
+from ..model import load_model, load_tokenizer
+
+
+if TYPE_CHECKING:
+    from peft import LoraModel
+    from transformers import PreTrainedModel
+
+    from ..data.data_utils import DatasetModule
+
+
+def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_keys: list[str] = []) -> None:
+    state_dict_a = model_a.state_dict()
+    state_dict_b = model_b.state_dict()
+    assert set(state_dict_a.keys()) == set(state_dict_b.keys())
+    for name in state_dict_a.keys():
+        if any(key in name for key in diff_keys):
+            assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) is False
+        else:
+            assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) is True
+
+
+def check_lora_model(model: "LoraModel") -> tuple[set[str], set[str]]:
+    linear_modules, extra_modules = set(), set()
+    for name, param in model.named_parameters():
+        if any(module in name for module in ["lora_A", "lora_B"]):
+            linear_modules.add(name.split(".lora_", maxsplit=1)[0].split(".")[-1])
+            assert param.requires_grad is True
+            assert param.dtype == torch.float32
+        elif "modules_to_save" in name:
+            extra_modules.add(name.split(".modules_to_save", maxsplit=1)[0].split(".")[-1])
+            assert param.requires_grad is True
+            assert param.dtype == torch.float32
+        else:
+            assert param.requires_grad is False
+            assert param.dtype == torch.float16
+
+    return linear_modules, extra_modules
+
+
+def load_train_model(add_valuehead: bool = False, **kwargs) -> "PreTrainedModel":
+    model_args, _, _, finetuning_args, _ = get_train_args(kwargs)
+    tokenizer = load_tokenizer(model_args)["tokenizer"]
+    return load_model(tokenizer, model_args, finetuning_args, is_trainable=True, add_valuehead=add_valuehead)
+
+
+def load_infer_model(add_valuehead: bool = False, **kwargs) -> "PreTrainedModel":
+    model_args, _, finetuning_args, _ = get_infer_args(kwargs)
+    tokenizer = load_tokenizer(model_args)["tokenizer"]
+    return load_model(tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead)
+
+
+def load_reference_model(
+    model_path: str,
+    lora_path: Optional[str] = None,
+    use_lora: bool = False,
+    use_pissa: bool = False,
+    is_trainable: bool = False,
+    add_valuehead: bool = False,
+) -> Union["PreTrainedModel", "LoraModel"]:
+    if add_valuehead:
+        model: AutoModelForCausalLMWithValueHead = AutoModelForCausalLMWithValueHead.from_pretrained(
+            model_path, torch_dtype=torch.float16, device_map="auto"
+        )
+
+        return model
+
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
+    if use_lora or use_pissa:
+        model = PeftModel.from_pretrained(
+            model, lora_path, subfolder="pissa_init" if use_pissa else None, is_trainable=is_trainable
+        )
+        for param in filter(lambda p: p.requires_grad, model.parameters()):
+            param.data = param.data.to(torch.float32)
+
+    return model
+
+
+def load_dataset_module(**kwargs) -> "DatasetModule":
+    model_args, data_args, training_args, _, _ = get_train_args(kwargs)
+    tokenizer_module = load_tokenizer(model_args)
+    template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
+    dataset_module = get_dataset(template, model_args, data_args, training_args, kwargs["stage"], **tokenizer_module)
+    return dataset_module
+
+
+def patch_valuehead_model() -> None:
+    def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: dict[str, "torch.Tensor"]) -> None:
+        state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")}
+        self.v_head.load_state_dict(state_dict, strict=False)
+        del state_dict
+
+    AutoModelForCausalLMWithValueHead.post_init = post_init
diff --git a/LlamaFactory/src/llamafactory/train/trainer_utils.py b/LlamaFactory/src/llamafactory/train/trainer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e58316092ceaeddda9413509f8fda51b098f967f
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/trainer_utils.py
@@ -0,0 +1,894 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the original GaLore's implementation: https://github.com/jiaweizzhao/GaLore
+# and the original LoRA+'s implementation: https://github.com/nikhil-ghosh-berkeley/loraplus
+# and the original BAdam's implementation: https://github.com/Ledzy/BAdam
+# and the HuggingFace's TRL library: https://github.com/huggingface/trl
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections.abc import Callable, Mapping
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import torch
+from transformers import Trainer
+from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.modeling_utils import is_fsdp_enabled
+from transformers.optimization import get_scheduler
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+from transformers.trainer_pt_utils import get_parameter_names
+from typing_extensions import override
+
+from ..extras import logging
+from ..extras.constants import IGNORE_INDEX, SWANLAB_CONFIG
+from ..extras.misc import get_device_name
+from ..extras.packages import is_apollo_available, is_galore_available, is_ray_available
+from ..hparams import FinetuningArguments, ModelArguments
+from ..model import find_all_linear_modules, load_model, load_tokenizer, load_valuehead_params
+
+
+if is_galore_available():
+    from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit  # type: ignore
+
+
+if is_apollo_available():
+    from apollo_torch import APOLLOAdamW  # type: ignore
+
+
+if is_ray_available():
+    import ray
+    from ray.util.placement_group import PlacementGroup, placement_group
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, TrainerCallback, TrainerState
+    from trl import AutoModelForCausalLMWithValueHead
+
+    from ..hparams import DataArguments, TrainingArguments
+
+
+# Module-level logger used by the helpers defined below.
+logger = logging.get_logger(__name__)
+
+
+class DummyOptimizer(torch.optim.Optimizer):
+    r"""Placeholder optimizer returned by the layerwise GaLore/APOLLO paths.
+
+    It owns a single throw-away tensor so the base class accepts it, keeps the per-parameter optimizers in
+    ``optimizer_dict``, and turns ``step`` and ``zero_grad`` into no-ops.
+    """
+
+    def __init__(
+        self, lr: float = 1e-3, optimizer_dict: Optional[dict["torch.nn.Parameter", "torch.optim.Optimizer"]] = None
+    ) -> None:
+        placeholder = torch.randn(1, 1)  # the base class refuses an empty parameter list
+        self.optimizer_dict = optimizer_dict
+        super().__init__([placeholder], {"lr": lr})
+
+    @override
+    def zero_grad(self, set_to_none: bool = True) -> None:
+        # Intentionally does nothing.
+        return None
+
+    @override
+    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
+        # Intentionally does nothing; the closure is never called.
+        return None
+
+
+def create_modelcard_and_push(
+    trainer: "Trainer",
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "TrainingArguments",
+    finetuning_args: "FinetuningArguments",
+) -> None:
+    r"""Create the model card after training and push it to the hub when requested.
+
+    Nothing is written when ``training_args.do_train`` is false. Otherwise the card is pushed through
+    ``trainer.push_to_hub`` if ``training_args.push_to_hub`` is set, or created locally if it is not.
+    """
+    card_kwargs = {
+        "tasks": "text-generation",
+        "finetuned_from": model_args.model_name_or_path,
+        "tags": ["llama-factory", finetuning_args.finetuning_type],
+    }
+    if data_args.dataset is not None:
+        card_kwargs["dataset"] = data_args.dataset
+
+    # Tag the acceleration backends that were enabled for this run.
+    if model_args.use_unsloth:
+        card_kwargs["tags"].append("unsloth")
+
+    if model_args.use_kt:
+        card_kwargs["tags"].append("ktransformers")
+
+    if not training_args.do_train:
+        return
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(**card_kwargs)
+    else:
+        Trainer.create_model_card(trainer, license="other", **card_kwargs)  # prevent from connecting to hub
+
+
+def create_ref_model(
+    model_args: "ModelArguments", finetuning_args: "FinetuningArguments", add_valuehead: bool = False
+) -> Optional[Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]]:
+    r"""Create the frozen reference model for PPO/DPO training. Evaluation mode is not supported.
+
+    The model is loaded from ``finetuning_args.ref_model`` when that is set. Otherwise it is loaded from the
+    training model's own arguments, except for LoRA fine-tuning where ``None`` is returned.
+
+    The valuehead parameter is randomly initialized since it is useless for PPO training.
+    """
+    uses_explicit_ref = finetuning_args.ref_model is not None
+    if not uses_explicit_ref and finetuning_args.finetuning_type == "lora":
+        return None
+
+    if uses_explicit_ref:
+        ref_model_args = ModelArguments.copyfrom(
+            model_args,
+            model_name_or_path=finetuning_args.ref_model,
+            adapter_name_or_path=finetuning_args.ref_model_adapters,
+            quantization_bit=finetuning_args.ref_model_quantization_bit,
+        )
+    else:
+        ref_model_args = ModelArguments.copyfrom(model_args)
+
+    # The reference model is always loaded with default fine-tuning arguments and is never trainable.
+    ref_finetuning_args = FinetuningArguments()
+    ref_tokenizer = load_tokenizer(ref_model_args)["tokenizer"]
+    ref_model = load_model(
+        ref_tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
+    )
+    if uses_explicit_ref:
+        logger.info_rank0(f"Created reference model from {finetuning_args.ref_model}")
+    else:
+        logger.info_rank0("Created reference model from the model itself.")
+
+    return ref_model
+
+
+def create_reward_model(
+    model: "AutoModelForCausalLMWithValueHead", model_args: "ModelArguments", finetuning_args: "FinetuningArguments"
+) -> Optional["AutoModelForCausalLMWithValueHead"]:
+    r"""Set up the reward source for PPO training according to ``finetuning_args.reward_model_type``.
+
+    * ``"api"``: returns ``finetuning_args.reward_model`` itself (the server URL), despite the annotated type.
+    * ``"lora"``: loads the reward adapter and value-head buffers into ``model`` in place and returns ``None``.
+    * anything else: loads a separate, frozen value-head model from ``finetuning_args.reward_model``.
+    """
+    reward_type = finetuning_args.reward_model_type
+
+    if reward_type == "api":
+        assert finetuning_args.reward_model.startswith("http"), "Please provide full url."
+        logger.info_rank0(f"Use reward server {finetuning_args.reward_model}")
+        return finetuning_args.reward_model
+
+    if reward_type == "lora":
+        model.pretrained_model.load_adapter(finetuning_args.reward_model, "reward")
+        for name, param in model.named_parameters():  # https://github.com/huggingface/peft/issues/1090
+            if "default" in name:
+                param.data = param.data.to(torch.float32)  # trainable params should be in fp32
+
+        vhead_params = load_valuehead_params(finetuning_args.reward_model, model_args)
+        assert vhead_params is not None, "Reward model is not correctly loaded."
+        reward_weight = vhead_params["v_head.summary.weight"]
+        reward_bias = vhead_params["v_head.summary.bias"]
+        # Non-persistent buffers: the reward head plus zero-filled placeholders of the same shapes.
+        model.register_buffer("reward_head_weight", reward_weight, persistent=False)
+        model.register_buffer("reward_head_bias", reward_bias, persistent=False)
+        model.register_buffer("default_head_weight", torch.zeros_like(reward_weight), persistent=False)
+        model.register_buffer("default_head_bias", torch.zeros_like(reward_bias), persistent=False)
+        logger.info_rank0(f"Loaded adapter weights of reward model from {finetuning_args.reward_model}")
+        return None
+
+    reward_model_args = ModelArguments.copyfrom(
+        model_args,
+        model_name_or_path=finetuning_args.reward_model,
+        adapter_name_or_path=finetuning_args.reward_model_adapters,
+        quantization_bit=finetuning_args.reward_model_quantization_bit,
+    )
+    reward_finetuning_args = FinetuningArguments()
+    reward_tokenizer = load_tokenizer(reward_model_args)["tokenizer"]
+    reward_model = load_model(
+        reward_tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
+    )
+    logger.info_rank0(f"Loaded full weights of reward model from {finetuning_args.reward_model}")
+    logger.warning_rank0("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.")
+    return reward_model
+
+
+def _get_decay_parameter_names(model: "PreTrainedModel") -> list[str]:
+    r"""Return the names of parameters that receive weight decay.
+
+    These are the names reported by ``get_parameter_names`` with ``ALL_LAYERNORM_LAYERS`` excluded, minus any
+    name that contains ``"bias"``.
+    """
+    return [name for name in get_parameter_names(model, ALL_LAYERNORM_LAYERS) if "bias" not in name]
+
+
+def _create_galore_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+    finetuning_args: "FinetuningArguments",
+) -> "torch.optim.Optimizer":
+    r"""Build a GaLore optimizer, either as one optimizer or as per-parameter (layerwise) optimizers.
+
+    Trainable weights with more than one dimension that belong to ``torch.nn.Linear`` modules whose name
+    contains a GaLore target receive the GaLore projection arguments. Every other trainable parameter is
+    handled by the same optimizer class without them.
+
+    Args:
+        model: The model whose trainable parameters are optimized.
+        training_args: Supplies ``optim``, ``weight_decay``, ``learning_rate`` and ``gradient_accumulation_steps``.
+        finetuning_args: Supplies the ``galore_*`` options and ``freeze_vision_tower``.
+
+    Returns:
+        The GaLore optimizer. When ``galore_layerwise`` is enabled, a ``DummyOptimizer`` holding the
+        per-parameter optimizers is returned instead and the updates run in post-accumulate-grad hooks.
+
+    Raises:
+        NotImplementedError: If ``training_args.optim`` has no GaLore counterpart.
+        ValueError: If layerwise GaLore is combined with gradient accumulation.
+    """
+    if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all":
+        galore_targets = find_all_linear_modules(model, finetuning_args.freeze_vision_tower)
+    else:
+        galore_targets = finetuning_args.galore_target
+
+    # Only trainable matrices (ndim > 1) of the targeted linear modules are optimized with GaLore.
+    galore_params: list[torch.nn.Parameter] = []
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.Linear) and any(target in name for target in galore_targets):
+            for param in module.parameters():
+                if param.requires_grad and len(param.shape) > 1:
+                    galore_params.append(param)
+
+    galore_kwargs = {
+        "rank": finetuning_args.galore_rank,
+        "update_proj_gap": finetuning_args.galore_update_interval,
+        "scale": finetuning_args.galore_scale,
+        "proj_type": finetuning_args.galore_proj_type,
+    }
+
+    id_galore_params = {id(param) for param in galore_params}
+    decay_params, nodecay_params = [], []  # they are non-galore parameters
+    trainable_params: list[torch.nn.Parameter] = []  # galore_params + decay_params + nodecay_params
+    # A set makes the per-parameter membership test below O(1) instead of a list scan.
+    decay_param_names = set(_get_decay_parameter_names(model))
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            trainable_params.append(param)
+            if id(param) not in id_galore_params:
+                if name in decay_param_names:
+                    decay_params.append(param)
+                else:
+                    nodecay_params.append(param)
+
+    # Only the keyword arguments are reused; the optimizer class is replaced by its GaLore variant.
+    _, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+
+    if training_args.optim == "adamw_torch":
+        optim_class = GaLoreAdamW
+    elif training_args.optim in ["adamw_bnb_8bit", "adamw_8bit", "paged_adamw_8bit"]:
+        optim_class = GaLoreAdamW8bit
+    elif training_args.optim == "adafactor":
+        optim_class = GaLoreAdafactor
+    else:
+        raise NotImplementedError(f"Unknown optim: {training_args.optim}.")
+
+    if finetuning_args.galore_layerwise:
+        logger.warning_rank0("The displayed gradient norm will be all zeros in layerwise GaLore.")
+        if training_args.gradient_accumulation_steps != 1:
+            raise ValueError("Per-layer GaLore does not support gradient accumulation.")
+
+        # One optimizer per parameter, stepped from a hook as soon as that parameter's gradient is ready.
+        optimizer_dict: dict[torch.Tensor, torch.optim.Optimizer] = {}
+        for param in nodecay_params:
+            param_groups = [dict(params=[param], weight_decay=0.0)]
+            optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
+        for param in decay_params:
+            param_groups = [dict(params=[param], weight_decay=training_args.weight_decay)]
+            optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
+        for param in galore_params:  # galore params have weight decay
+            param_groups = [dict(params=[param], weight_decay=training_args.weight_decay, **galore_kwargs)]
+            optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
+
+        def optimizer_hook(param: "torch.nn.Parameter"):
+            if param.grad is not None:
+                optimizer_dict[param].step()
+                optimizer_dict[param].zero_grad()
+
+        for param in trainable_params:
+            param.register_post_accumulate_grad_hook(optimizer_hook)
+
+        optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict)
+    else:
+        param_groups = [
+            dict(params=nodecay_params, weight_decay=0.0),
+            dict(params=decay_params, weight_decay=training_args.weight_decay),
+            dict(params=galore_params, weight_decay=training_args.weight_decay, **galore_kwargs),
+        ]
+        optimizer = optim_class(param_groups, **optim_kwargs)
+
+    logger.info_rank0(
+        f"Using GaLore optimizer with args: {galore_kwargs}. "
+        "It may cause hanging at the start of training, wait patiently."
+    )
+    return optimizer
+
+
+def _create_apollo_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+    finetuning_args: "FinetuningArguments",
+) -> "torch.optim.Optimizer":
+    r"""Build an APOLLO optimizer, either as one optimizer or as per-parameter (layerwise) optimizers.
+
+    Trainable weights with more than one dimension that belong to ``torch.nn.Linear`` modules whose name
+    contains an APOLLO target receive the APOLLO projection arguments. Every other trainable parameter is
+    handled by the same optimizer class without them.
+
+    Args:
+        model: The model whose trainable parameters are optimized.
+        training_args: Supplies ``optim``, ``weight_decay``, ``learning_rate`` and ``gradient_accumulation_steps``.
+        finetuning_args: Supplies the ``apollo_*`` options and ``freeze_vision_tower``.
+
+    Returns:
+        The APOLLO optimizer. When ``apollo_layerwise`` is enabled, a ``DummyOptimizer`` holding the
+        per-parameter optimizers is returned instead and the updates run in post-accumulate-grad hooks.
+
+    Raises:
+        NotImplementedError: If ``training_args.optim`` is not ``"adamw_torch"``.
+        ValueError: If layerwise APOLLO is combined with gradient accumulation.
+    """
+    if len(finetuning_args.apollo_target) == 1 and finetuning_args.apollo_target[0] == "all":
+        apollo_targets = find_all_linear_modules(model, finetuning_args.freeze_vision_tower)
+    else:
+        apollo_targets = finetuning_args.apollo_target
+
+    # Only trainable matrices (ndim > 1) of the targeted linear modules are optimized with APOLLO.
+    apollo_params: list[torch.nn.Parameter] = []
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.Linear) and any(target in name for target in apollo_targets):
+            for param in module.parameters():
+                if param.requires_grad and len(param.shape) > 1:
+                    apollo_params.append(param)
+
+    apollo_kwargs = {
+        "rank": finetuning_args.apollo_rank,
+        "proj": finetuning_args.apollo_proj,
+        "proj_type": finetuning_args.apollo_proj_type,
+        "update_proj_gap": finetuning_args.apollo_update_interval,
+        "scale": finetuning_args.apollo_scale,
+        "scale_type": finetuning_args.apollo_scale_type,
+        "scale_front": finetuning_args.apollo_scale_front,
+    }
+
+    id_apollo_params = {id(param) for param in apollo_params}
+    decay_params, nodecay_params = [], []  # they are non-apollo parameters
+    trainable_params: list[torch.nn.Parameter] = []  # apollo_params + decay_params + nodecay_params
+    # A set makes the per-parameter membership test below O(1) instead of a list scan.
+    decay_param_names = set(_get_decay_parameter_names(model))
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            trainable_params.append(param)
+            if id(param) not in id_apollo_params:
+                if name in decay_param_names:
+                    decay_params.append(param)
+                else:
+                    nodecay_params.append(param)
+
+    # Only the keyword arguments are reused; the optimizer class is replaced by the APOLLO variant.
+    _, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+
+    if training_args.optim == "adamw_torch":
+        optim_class = APOLLOAdamW
+    else:
+        raise NotImplementedError(f"Unknown optim: {training_args.optim}.")
+
+    if finetuning_args.apollo_layerwise:
+        logger.warning_rank0("The displayed gradient norm will be all zeros in layerwise APOLLO.")
+        if training_args.gradient_accumulation_steps != 1:
+            raise ValueError("Per-layer APOLLO does not support gradient accumulation.")
+
+        # One optimizer per parameter, stepped from a hook as soon as that parameter's gradient is ready.
+        optimizer_dict: dict[torch.Tensor, torch.optim.Optimizer] = {}
+        for param in nodecay_params:
+            param_groups = [dict(params=[param], weight_decay=0.0)]
+            optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
+        for param in decay_params:
+            param_groups = [dict(params=[param], weight_decay=training_args.weight_decay)]
+            optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
+        for param in apollo_params:  # apollo params have weight decay
+            param_groups = [dict(params=[param], weight_decay=training_args.weight_decay, **apollo_kwargs)]
+            optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
+
+        def optimizer_hook(param: "torch.nn.Parameter"):
+            if param.grad is not None:
+                optimizer_dict[param].step()
+                optimizer_dict[param].zero_grad()
+
+        for param in trainable_params:
+            param.register_post_accumulate_grad_hook(optimizer_hook)
+
+        optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict)
+    else:
+        param_groups = [
+            dict(params=nodecay_params, weight_decay=0.0),
+            dict(params=decay_params, weight_decay=training_args.weight_decay),
+            dict(params=apollo_params, weight_decay=training_args.weight_decay, **apollo_kwargs),
+        ]
+        optimizer = optim_class(param_groups, **optim_kwargs)
+
+    logger.info_rank0(f"Using APOLLO optimizer with args: {apollo_kwargs}.")
+    return optimizer
+
+
+def _create_loraplus_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+    finetuning_args: "FinetuningArguments",
+) -> "torch.optim.Optimizer":
+    r"""Build the LoRA+ optimizer, which gives different learning rates to different trainable parameters.
+
+    Parameter groups (trainable parameters only):
+        * ``embedding``: names containing ``lora_embedding_B``; learning rate ``loraplus_lr_embedding``.
+        * ``lora_b`` / ``lora_b_nodecay``: names containing ``lora_B`` or 1-D parameters; learning rate
+          ``learning_rate * loraplus_lr_ratio``, with or without weight decay.
+        * ``lora_a``: everything else; the base learning rate.
+
+    Args:
+        model: The model whose trainable parameters are optimized.
+        training_args: Supplies ``learning_rate``, ``weight_decay`` and the optimizer class and kwargs.
+        finetuning_args: Supplies ``loraplus_lr_ratio`` and ``loraplus_lr_embedding``.
+
+    Returns:
+        The optimizer built from the four parameter groups above.
+    """
+    default_lr = training_args.learning_rate
+    loraplus_lr = training_args.learning_rate * finetuning_args.loraplus_lr_ratio
+    embedding_lr = finetuning_args.loraplus_lr_embedding
+
+    # A set makes the per-parameter membership test below O(1) instead of a list scan.
+    decay_param_names = set(_get_decay_parameter_names(model))
+    param_dict: dict[str, list[torch.nn.Parameter]] = {
+        "lora_a": [],
+        "lora_b": [],
+        "lora_b_nodecay": [],
+        "embedding": [],
+    }
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue
+
+        if "lora_embedding_B" in name:
+            param_dict["embedding"].append(param)
+        elif "lora_B" in name or param.ndim == 1:
+            if name in decay_param_names:
+                param_dict["lora_b"].append(param)
+            else:
+                param_dict["lora_b_nodecay"].append(param)
+        else:
+            param_dict["lora_a"].append(param)
+
+    optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+    param_groups = [
+        dict(params=param_dict["lora_a"], lr=default_lr, weight_decay=training_args.weight_decay),
+        dict(params=param_dict["lora_b"], lr=loraplus_lr, weight_decay=training_args.weight_decay),
+        dict(params=param_dict["lora_b_nodecay"], lr=loraplus_lr, weight_decay=0.0),
+        dict(params=param_dict["embedding"], lr=embedding_lr, weight_decay=training_args.weight_decay),
+    ]
+    optimizer = optim_class(param_groups, **optim_kwargs)
+    logger.info_rank0(f"Using LoRA+ optimizer with loraplus lr ratio {finetuning_args.loraplus_lr_ratio:.2f}.")
+    return optimizer
+
+
+def _create_badam_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+    finetuning_args: "FinetuningArguments",
+) -> "torch.optim.Optimizer":
+    r"""Build a BAdam block optimizer in layer-wise or ratio-based mode.
+
+    Args:
+        model: The model whose trainable parameters are optimized.
+        training_args: Supplies ``weight_decay`` and the base optimizer class and kwargs.
+        finetuning_args: Supplies ``badam_mode`` (``"layer"`` or ``"ratio"``) and the other ``badam_*`` options.
+
+    Returns:
+        A ``BlockOptimizer`` wrapping the base optimizer (layer mode) or a ``BlockOptimizerRatio`` (ratio mode).
+
+    Raises:
+        ValueError: If ``badam_mode`` is neither ``"layer"`` nor ``"ratio"``.
+        AssertionError: If ``badam_update_ratio`` is not greater than ``1e-6`` in ratio mode.
+    """
+    decay_params, nodecay_params = [], []
+    # A set makes the per-parameter membership test below O(1) instead of a list scan.
+    decay_param_names = set(_get_decay_parameter_names(model))
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            if name in decay_param_names:
+                decay_params.append(param)
+            else:
+                nodecay_params.append(param)
+
+    optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+    param_groups = [
+        dict(params=nodecay_params, weight_decay=0.0),
+        dict(params=decay_params, weight_decay=training_args.weight_decay),
+    ]
+
+    if finetuning_args.badam_mode == "layer":
+        from badam import BlockOptimizer  # type: ignore
+
+        base_optimizer = optim_class(param_groups, **optim_kwargs)
+        optimizer = BlockOptimizer(
+            base_optimizer=base_optimizer,
+            named_parameters_list=list(model.named_parameters()),
+            block_prefix_list=None,
+            switch_block_every=finetuning_args.badam_switch_interval,
+            start_block=finetuning_args.badam_start_block,
+            switch_mode=finetuning_args.badam_switch_mode,
+            verbose=finetuning_args.badam_verbose,
+            ds_zero3_enabled=is_deepspeed_zero3_enabled(),
+        )
+        logger.info_rank0(
+            f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
+            f"switch block every {finetuning_args.badam_switch_interval} steps, "
+            f"default start block is {finetuning_args.badam_start_block}"
+        )
+
+    elif finetuning_args.badam_mode == "ratio":
+        from badam import BlockOptimizerRatio  # type: ignore
+
+        assert finetuning_args.badam_update_ratio > 1e-6
+        optimizer = BlockOptimizerRatio(
+            param_groups=param_groups,
+            named_parameters_list=list(model.named_parameters()),
+            update_ratio=finetuning_args.badam_update_ratio,
+            mask_mode=finetuning_args.badam_mask_mode,
+            verbose=finetuning_args.badam_verbose,
+            include_embedding=False,
+            **optim_kwargs,
+        )
+        logger.info_rank0(
+            f"Using BAdam optimizer with ratio-based update, update ratio is {finetuning_args.badam_update_ratio}, "
+            f"mask mode is {finetuning_args.badam_mask_mode}"
+        )
+
+    else:
+        # Previously this fell through to `return optimizer` with the name unbound (UnboundLocalError).
+        raise ValueError(f"Unknown BAdam mode: {finetuning_args.badam_mode}.")
+
+    return optimizer
+
+
+def _create_adam_mini_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+) -> "torch.optim.Optimizer":
+    r"""Build an Adam-mini optimizer over all named parameters of ``model``.
+
+    The hidden size and head counts are read from ``model.config`` and passed as ``None`` when the config
+    does not define them.
+    """
+    from adam_mini import Adam_mini  # type: ignore
+
+    model_config = model.config
+    adam_mini_kwargs = {
+        "dim": getattr(model_config, "hidden_size", None),
+        "n_heads": getattr(model_config, "num_attention_heads", None),
+        "n_kv_heads": getattr(model_config, "num_key_value_heads", None),
+    }
+    optimizer = Adam_mini(
+        named_parameters=model.named_parameters(),
+        lr=training_args.learning_rate,
+        betas=(training_args.adam_beta1, training_args.adam_beta2),
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+        model_sharding=is_fsdp_enabled() or is_deepspeed_zero3_enabled(),
+        **adam_mini_kwargs,
+    )
+    logger.info_rank0("Using Adam-mini optimizer.")
+    return optimizer
+
+
+def _create_muon_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+) -> "torch.optim.Optimizer":
+    r"""Build a Muon optimizer, splitting trainable parameters between its Muon and AdamW parts.
+
+    A trainable parameter goes to Muon when it is 2-D and its name contains neither ``"embed"`` nor
+    ``"lm_head"``; all remaining trainable parameters go to AdamW.
+    """
+    from ..third_party.muon import Muon
+
+    muon_params, adamw_params = [], []
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue
+
+        # Use Muon for 2D parameters that aren't embeddings or heads
+        is_muon_param = param.ndim == 2 and "embed" not in name and "lm_head" not in name
+        bucket = muon_params if is_muon_param else adamw_params
+        bucket.append(param)
+
+    optimizer = Muon(
+        lr=training_args.learning_rate,
+        wd=training_args.weight_decay,
+        muon_params=muon_params,
+        adamw_params=adamw_params,
+        adamw_betas=(training_args.adam_beta1, training_args.adam_beta2),
+        adamw_eps=training_args.adam_epsilon,
+    )
+    logger.info_rank0(
+        f"Using Muon optimizer with {len(muon_params)} Muon params and {len(adamw_params)} AdamW params."
+    )
+    return optimizer
+
+
+def create_custom_optimizer(
+    model: "PreTrainedModel",
+    training_args: "TrainingArguments",
+    finetuning_args: "FinetuningArguments",
+) -> Optional["torch.optim.Optimizer"]:
+    r"""Return the custom optimizer selected by ``finetuning_args``, or ``None`` when none is requested.
+
+    The options are checked in a fixed order and the first enabled one wins: GaLore, APOLLO, LoRA+, BAdam,
+    Adam-mini, Muon.
+    """
+    if finetuning_args.use_galore:
+        optimizer = _create_galore_optimizer(model, training_args, finetuning_args)
+    elif finetuning_args.use_apollo:
+        optimizer = _create_apollo_optimizer(model, training_args, finetuning_args)
+    elif finetuning_args.loraplus_lr_ratio is not None:
+        optimizer = _create_loraplus_optimizer(model, training_args, finetuning_args)
+    elif finetuning_args.use_badam:
+        optimizer = _create_badam_optimizer(model, training_args, finetuning_args)
+    elif finetuning_args.use_adam_mini:
+        optimizer = _create_adam_mini_optimizer(model, training_args)
+    elif finetuning_args.use_muon:
+        optimizer = _create_muon_optimizer(model, training_args)
+    else:
+        optimizer = None
+
+    return optimizer
+
+
+def create_custom_scheduler(
+    training_args: "TrainingArguments",
+    num_training_steps: int,
+    optimizer: Optional["torch.optim.Optimizer"] = None,
+) -> None:
+    r"""Prepare scheduler settings that the default trainer setup does not cover.
+
+    * For ``"warmup_stable_decay"``, fills in ``num_stable_steps`` (one third of the post-warmup steps) and
+      ``num_decay_steps`` (the rest) in ``training_args.lr_scheduler_kwargs`` unless already provided.
+    * For a ``DummyOptimizer``, creates one scheduler per wrapped optimizer and steps it from a
+      post-accumulate-grad hook on the matching parameter.
+    """
+    if training_args.lr_scheduler_type == "warmup_stable_decay":
+        warmup_steps = training_args.get_warmup_steps(num_training_steps)
+        steps_after_warmup = num_training_steps - warmup_steps
+        stable_steps = steps_after_warmup // 3  # use 1/3 for stable by default
+        scheduler_kwargs = training_args.lr_scheduler_kwargs or {}
+        # User-supplied values take precedence over the computed defaults.
+        scheduler_kwargs.setdefault("num_stable_steps", stable_steps)
+        scheduler_kwargs.setdefault("num_decay_steps", steps_after_warmup - stable_steps)
+        training_args.lr_scheduler_kwargs = scheduler_kwargs
+
+    if optimizer is None or not isinstance(optimizer, DummyOptimizer):
+        return
+
+    optimizer_dict = optimizer.optimizer_dict
+    scheduler_dict: dict[torch.nn.Parameter, torch.optim.lr_scheduler.LRScheduler] = {
+        param: get_scheduler(
+            training_args.lr_scheduler_type,
+            optimizer=optimizer_dict[param],
+            num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
+            num_training_steps=num_training_steps,
+            scheduler_specific_kwargs=training_args.lr_scheduler_kwargs,
+        )
+        for param in optimizer_dict.keys()
+    }
+
+    def scheduler_hook(param: "torch.nn.Parameter"):
+        scheduler_dict[param].step()
+
+    for param in optimizer_dict.keys():
+        param.register_post_accumulate_grad_hook(scheduler_hook)
+
+
+def get_batch_logps(
+    logits: "torch.Tensor",
+    labels: "torch.Tensor",
+    label_pad_token_id: int = IGNORE_INDEX,
+    ld_alpha: Optional[float] = None,
+) -> tuple["torch.Tensor", "torch.Tensor"]:
+    r"""Sum the log probabilities that ``logits`` assign to ``labels`` for every sequence in the batch.
+
+    Labels are shifted by one position against the logits, and positions equal to ``label_pad_token_id``
+    are ignored. When ``ld_alpha`` is given, the batch is treated as chosen examples followed by rejected
+    examples: tokens beyond the shorter valid length of each pair are scaled by ``ld_alpha``.
+
+    Returns:
+        logps: A tensor of shape (batch_size,) containing the sum of log probabilities.
+        valid_length: A tensor of shape (batch_size,) containing the number of non-masked tokens.
+
+    Raises:
+        ValueError: If the leading dimensions of ``logits`` do not match the shape of ``labels``.
+
+    """
+    if logits.shape[:-1] != labels.shape:
+        raise ValueError("Logits (batchsize x seqlen) and labels must have the same shape.")
+
+    target_ids = labels[:, 1:].clone()
+    shifted_logits = logits[:, :-1, :]
+    loss_mask = target_ids != label_pad_token_id
+    target_ids[~loss_mask] = 0  # dummy token so that gather receives a valid index
+    log_probs = shifted_logits.log_softmax(-1)
+    per_token_logps = log_probs.gather(2, target_ids.unsqueeze(2)).squeeze(2)
+
+    valid_length = loss_mask.sum(-1)
+    if ld_alpha is None:
+        return (per_token_logps * loss_mask).sum(-1), valid_length
+
+    # First half of the batch holds the chosen examples, second half the rejected ones.
+    half_batch = target_ids.shape[0] // 2
+    shared_lengths = torch.min(valid_length[:half_batch], valid_length[half_batch:])
+    first_valid_position = torch.argmax(loss_mask.int(), dim=1)
+    public_lengths = first_valid_position + torch.cat([shared_lengths, shared_lengths], dim=0)
+
+    positions = torch.arange(target_ids.shape[-1], device=per_token_logps.device).expand_as(per_token_logps)
+    within_public = positions < public_lengths.unsqueeze(1)
+    front_mask = (within_public & loss_mask).float()
+    rear_mask = (~within_public & loss_mask).float()
+
+    front_logps = (per_token_logps * front_mask).sum(-1)
+    rear_logps = (per_token_logps * rear_mask).sum(-1)
+    return front_logps + ld_alpha * rear_logps, valid_length
+
+
+def dft_loss_func(
+    outputs: "torch.Tensor", labels: "torch.Tensor", num_items_in_batch: Optional["torch.Tensor"] = None
+):
+    r"""Compute the DFT loss from model outputs and labels.
+
+    ``outputs`` is read through ``.get``, so it must be mapping-like despite the annotation. When it has no
+    ``"logits"``, its ``"loss"`` entry is returned (a zero tensor if that is missing as well). Otherwise the
+    labels are shifted one position to the left, padded with -100, and passed to ``_dft_cross_entropy``.
+    """
+    raw_logits = outputs.get("logits")
+    if raw_logits is None:
+        return outputs.get("loss", torch.tensor(0.0))
+
+    float_logits = raw_logits.float()
+    flat_logits = float_logits.view(-1, float_logits.size(-1))
+    # Padding one ignored label on the right keeps the shifted labels as long as the logits.
+    padded_labels = torch.nn.functional.pad(labels, (0, 1), value=-100)
+    flat_targets = padded_labels[..., 1:].contiguous().view(-1).to(flat_logits.device)
+    return _dft_cross_entropy(flat_logits, flat_targets, num_items_in_batch)
+
+
+def _dft_cross_entropy(
+    source: "torch.Tensor",
+    target: "torch.Tensor",
+    num_items_in_batch: Optional["torch.Tensor"] = None,
+    ignore_index: int = -100,
+) -> "torch.Tensor":
+    r"""Cross entropy where each valid token's loss is scaled by the probability predicted for its target.
+
+    The scaling factor ``exp(-loss)`` is excluded from the gradient. The result is the weighted sum divided
+    by ``num_items_in_batch`` when that is given, otherwise the mean over valid tokens. A zero tensor is
+    returned when every target equals ``ignore_index``.
+    """
+    token_losses = torch.nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction="none")
+    keep = target != ignore_index
+    if not keep.any():
+        return torch.tensor(0.0, device=source.device, dtype=source.dtype)
+
+    kept_losses = token_losses[keep]
+    # exp(-CE) is the probability of the target token; detach it so it only acts as a weight.
+    target_probs = torch.exp(-kept_losses.detach())
+    weighted_losses = kept_losses * target_probs
+
+    if num_items_in_batch is None:
+        return weighted_losses.mean()
+
+    total_loss = weighted_losses.sum()
+    if torch.is_tensor(num_items_in_batch):
+        num_items_in_batch = num_items_in_batch.to(total_loss.device)
+
+    return total_loss / num_items_in_batch
+
+
+def eaft_loss_func(
+    outputs: "torch.Tensor",
+    labels: "torch.Tensor",
+    num_items_in_batch: Optional["torch.Tensor"] = None,
+    alpha: float = 1.0,
+) -> "torch.Tensor":
+    r"""Compute the EAFT loss from model outputs and labels.
+
+    ``outputs`` is read through ``.get``, so it must be mapping-like despite the annotation. When it has no
+    ``"logits"``, its ``"loss"`` entry is returned (a zero tensor if that is missing as well). Otherwise the
+    labels are shifted one position to the left, padded with -100, and passed to ``_eaft_cross_entropy``
+    together with ``alpha``.
+    """
+    raw_logits = outputs.get("logits")
+    if raw_logits is None:
+        return outputs.get("loss", torch.tensor(0.0))
+
+    float_logits = raw_logits.float()
+    flat_logits = float_logits.view(-1, float_logits.size(-1))
+    # Padding one ignored label on the right keeps the shifted labels as long as the logits.
+    padded_labels = torch.nn.functional.pad(labels, (0, 1), value=-100)
+    flat_targets = padded_labels[..., 1:].contiguous().view(-1).to(flat_logits.device)
+    return _eaft_cross_entropy(flat_logits, flat_targets, num_items_in_batch, alpha)
+
+
+def _eaft_cross_entropy(
+    source: "torch.Tensor",
+    target: "torch.Tensor",
+    num_items_in_batch: Optional["torch.Tensor"] = None,
+    alpha: float = 1.0,
+    ignore_index: int = -100,
+) -> "torch.Tensor":
+    r"""Cross entropy where each valid token is weighted by an entropy estimate of its predicted distribution.
+
+    The entropy is approximated from the largest logits of each token (at most 20), divided by 3.0 and
+    raised to the power ``alpha``. The weight is computed without gradient tracking.
+
+    Args:
+        source: Logits of shape (num_tokens, vocab_size).
+        target: Target ids of shape (num_tokens,); entries equal to ``ignore_index`` are skipped.
+        num_items_in_batch: If given, the weighted sum is divided by it instead of taking the mean.
+        alpha: Exponent applied to the normalized entropy.
+        ignore_index: Target value marking positions that do not contribute to the loss.
+
+    Returns:
+        The scalar loss, or a zero tensor when every target equals ``ignore_index``.
+    """
+    per_token_loss = torch.nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction="none")
+    valid_mask = target != ignore_index
+    if not valid_mask.any():
+        return torch.tensor(0.0, device=source.device, dtype=source.dtype)
+
+    valid_losses = per_token_loss[valid_mask]
+
+    with torch.no_grad():
+        source_detached = source[valid_mask].detach()
+
+        # Clamp k so that a vocabulary with fewer than 20 entries does not make torch.topk raise.
+        top_k = min(20, source_detached.size(-1))
+        topk_val, _ = torch.topk(source_detached, k=top_k, dim=-1)
+        logsumexp_topk = torch.logsumexp(topk_val, dim=-1, keepdim=True)
+        log_probs_topk = topk_val - logsumexp_topk
+        probs_topk = torch.exp(log_probs_topk)
+        entropy_approx = -(probs_topk * log_probs_topk).sum(dim=-1)
+
+        # NOTE(review): 3.0 is presumably ~ln(20), the maximum entropy over 20 outcomes - confirm.
+        entropy_term = entropy_approx / 3.0
+        adaptive_weight = torch.pow(entropy_term, alpha)
+
+    weighted_losses = valid_losses * adaptive_weight
+
+    if num_items_in_batch is not None:
+        total_loss = weighted_losses.sum()
+        if torch.is_tensor(num_items_in_batch):
+            num_items_in_batch = num_items_in_batch.to(total_loss.device)
+        loss = total_loss / num_items_in_batch
+    else:
+        loss = weighted_losses.mean()
+
+    return loss
+
+
+def nested_detach(
+    tensors: Union["torch.Tensor", list["torch.Tensor"], tuple["torch.Tensor"], dict[str, "torch.Tensor"]],
+    clone: bool = False,
+):
+    r"""Detach every tensor inside `tensors`, recursing through lists, tuples and mappings.
+
+    Containers are rebuilt with their own type. With ``clone=True`` each detached tensor is also cloned.
+    Values that are neither tensors nor supported containers are returned unchanged.
+    """
+    if isinstance(tensors, torch.Tensor):
+        detached = tensors.detach()
+        return detached.clone() if clone else detached
+
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(nested_detach(item, clone=clone) for item in tensors)
+
+    if isinstance(tensors, Mapping):
+        return type(tensors)({key: nested_detach(value, clone=clone) for key, value in tensors.items()})
+
+    return tensors
+
+
+def get_swanlab_callback(finetuning_args: "FinetuningArguments") -> "TrainerCallback":
+    r"""Get the callback for logging to SwanLab.
+
+    Logs in to SwanLab when an API key is configured, registers a Lark notification callback when a
+    webhook URL is configured, and builds a ``SwanLabCallback`` subclass that also writes the run's
+    public config to ``SWANLAB_CONFIG`` inside ``args.output_dir``.
+
+    Args:
+        finetuning_args: Arguments object providing the ``swanlab_*`` settings read below.
+
+    Returns:
+        The configured callback instance.
+    """
+    # Imported inside the function so swanlab is only needed when this callback is requested.
+    import swanlab  # type: ignore
+    from swanlab.integration.transformers import SwanLabCallback  # type: ignore
+
+    if finetuning_args.swanlab_api_key is not None:
+        swanlab.login(api_key=finetuning_args.swanlab_api_key)
+
+    if finetuning_args.swanlab_lark_webhook_url is not None:
+        from swanlab.plugin.notification import LarkCallback  # type: ignore
+
+        lark_callback = LarkCallback(
+            webhook_url=finetuning_args.swanlab_lark_webhook_url,
+            secret=finetuning_args.swanlab_lark_secret,
+        )
+        swanlab.register_callbacks([lark_callback])
+
+    class SwanLabCallbackExtension(SwanLabCallback):
+        def setup(self, args: "TrainingArguments", state: "TrainerState", model: "PreTrainedModel", **kwargs):
+            # Only the world-zero process sets up SwanLab and writes the config file.
+            if not state.is_world_process_zero:
+                return
+
+            super().setup(args, state, model, **kwargs)
+            # Best effort: any failure while reading the public run info falls back to an empty config.
+            try:
+                if hasattr(self, "_swanlab"):
+                    swanlab_public_config = self._swanlab.get_run().public.json()
+                else:  # swanlab <= 0.4.9
+                    swanlab_public_config = self._experiment.get_run().public.json()
+            except Exception:
+                swanlab_public_config = {}
+
+            # NOTE(review): assumes args.output_dir already exists at this point - confirm.
+            with open(os.path.join(args.output_dir, SWANLAB_CONFIG), "w") as f:
+                f.write(json.dumps(swanlab_public_config, indent=2))
+
+    swanlab_callback = SwanLabCallbackExtension(
+        project=finetuning_args.swanlab_project,
+        workspace=finetuning_args.swanlab_workspace,
+        experiment_name=finetuning_args.swanlab_run_name,
+        mode=finetuning_args.swanlab_mode,
+        config={"Framework": "🦙LlamaFactory"},
+        logdir=finetuning_args.swanlab_logdir,
+        tags=["🦙LlamaFactory"],
+    )
+    return swanlab_callback
+
+
+def get_placement_group(num_workers: int) -> tuple["PlacementGroup", dict[str, int]]:
+    r"""Get the Ray placement group for distributed training."""
+    bundle = {"CPU": 10}
+    device_name = get_device_name().upper()
+    if device_name != "CPU":
+        bundle[device_name] = 1
+    bundles = [bundle for _ in range(num_workers)]
+    pg = placement_group(bundles, strategy="PACK")
+
+    return pg, bundle
+
+
+def get_ray_remote_config_for_worker(
+    placement_group: "PlacementGroup",
+    bundle_idx: int,
+    rank: int,
+    world_size: int,
+    master_addr: str,
+    master_port: str,
+    env: dict[str, str] = None,
+) -> dict[str, Any]:
+    r"""Get the remote config for a Ray worker."""
+    env_vars = {
+        "RANK": str(rank),
+        "WORLD_SIZE": str(world_size),
+        "MASTER_ADDR": master_addr,
+        "MASTER_PORT": master_port,
+        "TORCHELASTIC_USE_AGENT_STORE": "False",
+    }
+    env.update(env_vars)
+
+    remote_config = {
+        "scheduling_strategy": PlacementGroupSchedulingStrategy(
+            placement_group=placement_group,
+            placement_group_bundle_index=bundle_idx,
+        ),
+        "runtime_env": {"env_vars": env},
+        "num_cpus": 10,
+    }
+
+    device_name = get_device_name()
+    if device_name == "gpu":
+        remote_config["num_gpus"] = 1
+    elif device_name == "npu":
+        remote_config["resources"] = {"NPU": 1}
+
+    return remote_config
+
+
+def get_ray_head_node_ip() -> str:
+    r"""Get the IP address of the Ray head node."""
+    head_ip = next(node["NodeManagerAddress"] for node in ray.nodes() if node.get("IsHead", False))
+    return head_ip
+
+
+def sort_placement_group_by_node_ip(placement_group: "PlacementGroup", master_addr: str = None) -> list[int]:
+    r"""Sort the placement group bundles by their node IP addresses."""
+
+    @ray.remote
+    def _get_node_ip():
+        return ray.util.get_node_ip_address().strip("[]")
+
+    tasks = []
+    for bundle_idx in range(placement_group.bundle_count):
+        task = _get_node_ip.options(
+            scheduling_strategy=PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_bundle_index=bundle_idx,
+            ),
+        ).remote()
+        tasks.append(task)
+
+    bundle_ips = ray.get(tasks)
+    bundle_node_ip_list = list(enumerate(bundle_ips))
+
+    sorted_bundle_node_ip_list = sorted(bundle_node_ip_list, key=lambda x: x[1])
+    sorted_bundle_indices = [item[0] for item in sorted_bundle_node_ip_list]
+
+    if master_addr is not None:
+        preferred_indices = [idx for idx, ip in bundle_node_ip_list if ip == master_addr]
+        if preferred_indices:
+            remaining = [i for i in sorted_bundle_indices if i not in preferred_indices]
+            sorted_bundle_indices = preferred_indices + remaining
+
+    return sorted_bundle_indices
diff --git a/LlamaFactory/src/llamafactory/train/tuner.py b/LlamaFactory/src/llamafactory/train/tuner.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d9a85f87d34279a7b766b14e62037c21e4f472b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/train/tuner.py
@@ -0,0 +1,304 @@
+# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+import torch.distributed as dist
+from transformers import EarlyStoppingCallback, PreTrainedModel
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
+from ..extras.misc import find_available_port, get_device_name, get_torch_device, infer_optim_dtype
+from ..extras.packages import is_mcore_adapter_available, is_ray_available
+from ..hparams import RayArguments, get_infer_args, get_ray_args, get_train_args, read_args
+from ..model import load_model, load_tokenizer
+from .callbacks import LogCallback, PissaConvertCallback, ReporterCallback
+from .dpo import run_dpo
+from .kto import run_kto
+from .ppo import run_ppo
+from .pt import run_pt
+from .rm import run_rm
+from .sft import run_sft
+from .trainer_utils import (
+    get_placement_group,
+    get_ray_head_node_ip,
+    get_ray_remote_config_for_worker,
+    get_swanlab_callback,
+    sort_placement_group_by_node_ip,
+)
+
+
+if is_ray_available():
+    import ray
+
+
+if TYPE_CHECKING:
+    from transformers import TrainerCallback
+
+
+logger = logging.get_logger(__name__)
+
+
+def _training_function(config: dict[str, Any]) -> None:
+    r"""Run a single training job described by ``config``.
+
+    Args:
+        config: Mapping with an ``"args"`` entry (forwarded to ``get_train_args``) and a
+            ``"callbacks"`` list, which is extended in place with the callbacks added below.
+
+    Raises:
+        ImportError: If ``use_mca`` is requested for pt/sft/dpo but mcore_adapter is not available.
+        ValueError: If ``finetuning_args.stage`` is not one of the handled stages.
+    """
+    args = config.get("args")
+    callbacks: list[Any] = config.get("callbacks")
+    model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
+
+    callbacks.append(LogCallback())
+    if finetuning_args.pissa_convert:
+        callbacks.append(PissaConvertCallback())
+
+    if finetuning_args.use_swanlab:
+        callbacks.append(get_swanlab_callback(finetuning_args))
+
+    if finetuning_args.early_stopping_steps is not None:
+        callbacks.append(EarlyStoppingCallback(early_stopping_patience=finetuning_args.early_stopping_steps))
+
+    callbacks.append(ReporterCallback(model_args, data_args, finetuning_args, generating_args))  # add to last
+
+    # The mcore_adapter workflows take precedence for the stages that support them.
+    if finetuning_args.stage in ["pt", "sft", "dpo"] and finetuning_args.use_mca:
+        if not is_mcore_adapter_available():
+            raise ImportError("mcore_adapter is not installed. Please install it with `pip install mcore-adapter`.")
+        if finetuning_args.stage == "pt":
+            from .mca import run_pt as run_pt_mca
+
+            run_pt_mca(model_args, data_args, training_args, finetuning_args, callbacks)
+        elif finetuning_args.stage == "sft":
+            from .mca import run_sft as run_sft_mca
+
+            run_sft_mca(model_args, data_args, training_args, finetuning_args, callbacks)
+        elif finetuning_args.stage == "dpo":
+            from .mca import run_dpo as run_dpo_mca
+
+            run_dpo_mca(model_args, data_args, training_args, finetuning_args, callbacks)
+
+    elif finetuning_args.stage == "pt":
+        run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
+    elif finetuning_args.stage == "sft":
+        run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
+    elif finetuning_args.stage == "rm":
+        run_rm(model_args, data_args, training_args, finetuning_args, callbacks)
+    elif finetuning_args.stage == "ppo":
+        run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
+    elif finetuning_args.stage == "dpo":
+        run_dpo(model_args, data_args, training_args, finetuning_args, callbacks)
+    elif finetuning_args.stage == "kto":
+        run_kto(model_args, data_args, training_args, finetuning_args, callbacks)
+    else:
+        raise ValueError(f"Unknown task: {finetuning_args.stage}.")
+
+    if is_ray_available() and ray.is_initialized():
+        return  # if ray is initialized it will destroy the process group on return
+
+    # Outside Ray, tear down the process group ourselves; a failure here is only logged.
+    try:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+    except Exception as e:
+        logger.warning(f"Failed to destroy process group: {e}.")
+
+
+def run_exp(args: Optional[dict[str, Any]] = None, callbacks: Optional[list["TrainerCallback"]] = None) -> None:
+    args = read_args(args)
+    if "-h" in args or "--help" in args:
+        get_train_args(args)
+
+    ray_args = get_ray_args(args)
+    callbacks = callbacks or []
+    if ray_args.use_ray:
+        _ray_training_function(ray_args, config={"args": args, "callbacks": callbacks})
+    else:
+        _training_function(config={"args": args, "callbacks": callbacks})
+
+
+def export_model(args: Optional[dict[str, Any]] = None) -> None:
+    r"""Load a model with the inference arguments and save it (plus tokenizer files) to ``export_dir``.
+
+    The model, tokenizer and optional processor are also pushed to the hub when
+    ``export_hub_model_id`` is set, the value head file is copied for the ``rm`` stage, and an
+    Ollama ``Modelfile`` is written next to the exported weights.
+
+    Args:
+        args: Optional argument mapping forwarded to ``get_infer_args``.
+
+    Raises:
+        ValueError: If ``export_dir`` is missing, if adapters and export quantization are combined,
+            if adapters would be merged into a quantized model, or if the loaded model is not a
+            ``PreTrainedModel``.
+    """
+    model_args, data_args, finetuning_args, _ = get_infer_args(args)
+
+    if model_args.export_dir is None:
+        raise ValueError("Please specify `export_dir` to save model.")
+
+    if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None:
+        raise ValueError("Please merge adapters before quantizing the model.")
+
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    processor = tokenizer_module["processor"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    model = load_model(tokenizer, model_args, finetuning_args)  # must after fixing tokenizer to resize vocab
+
+    if getattr(model, "quantization_method", None) is not None and model_args.adapter_name_or_path is not None:
+        raise ValueError("Cannot merge adapters to a quantized model.")
+
+    if not isinstance(model, PreTrainedModel):
+        raise ValueError("The model is not a `PreTrainedModel`, export aborted.")
+
+    if getattr(model, "quantization_method", None) is not None:  # quantized model adopts float16 type
+        setattr(model.config, "torch_dtype", torch.float16)
+    else:
+        if model_args.infer_dtype == "auto":
+            output_dtype = getattr(model.config, "torch_dtype", torch.float32)
+            if output_dtype == torch.float32:  # if infer_dtype is auto, try using half precision first
+                output_dtype = infer_optim_dtype(torch.bfloat16)
+        else:
+            output_dtype = getattr(torch, model_args.infer_dtype)
+
+        # Record the dtype in the config and cast the weights to match it.
+        setattr(model.config, "torch_dtype", output_dtype)
+        model = model.to(output_dtype)
+        logger.info_rank0(f"Convert model dtype to: {output_dtype}.")
+
+    model.save_pretrained(
+        save_directory=model_args.export_dir,
+        max_shard_size=f"{model_args.export_size}GB",
+        safe_serialization=(not model_args.export_legacy_format),
+    )
+    if model_args.export_hub_model_id is not None:
+        model.push_to_hub(
+            model_args.export_hub_model_id,
+            token=model_args.hf_hub_token,
+            max_shard_size=f"{model_args.export_size}GB",
+            safe_serialization=(not model_args.export_legacy_format),
+        )
+
+    if finetuning_args.stage == "rm":
+        # The value head is looked up next to the last adapter if any, otherwise next to the base model.
+        if model_args.adapter_name_or_path is not None:
+            vhead_path = model_args.adapter_name_or_path[-1]
+        else:
+            vhead_path = model_args.model_name_or_path
+
+        # Prefer the safetensors file; fall back to the legacy weights file.
+        if os.path.exists(os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME)):
+            shutil.copy(
+                os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME),
+                os.path.join(model_args.export_dir, V_HEAD_SAFE_WEIGHTS_NAME),
+            )
+            logger.info_rank0(f"Copied valuehead to {model_args.export_dir}.")
+        elif os.path.exists(os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME)):
+            shutil.copy(
+                os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME),
+                os.path.join(model_args.export_dir, V_HEAD_WEIGHTS_NAME),
+            )
+            logger.info_rank0(f"Copied valuehead to {model_args.export_dir}.")
+
+    # Saving the tokenizer/processor is best effort: failures are reported as a warning only.
+    try:
+        tokenizer.padding_side = "left"  # restore padding side
+        tokenizer.init_kwargs["padding_side"] = "left"
+        tokenizer.save_pretrained(model_args.export_dir)
+        if model_args.export_hub_model_id is not None:
+            tokenizer.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
+
+        if processor is not None:
+            processor.save_pretrained(model_args.export_dir)
+            if model_args.export_hub_model_id is not None:
+                processor.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
+
+    except Exception as e:
+        logger.warning_rank0(f"Cannot save tokenizer, please copy the files manually: {e}.")
+
+    ollama_modelfile = os.path.join(model_args.export_dir, "Modelfile")
+    with open(ollama_modelfile, "w", encoding="utf-8") as f:
+        f.write(template.get_ollama_modelfile(tokenizer))
+        logger.info_rank0(f"Ollama modelfile saved in {ollama_modelfile}")
+
+
+class Worker:
+    def __init__(self):
+        self._setup_env_visible_devices()
+
+        local_rank = os.environ.get("LOCAL_RANK", "0")
+        get_torch_device().set_device(int(local_rank))
+
+    def _setup_env_visible_devices(self) -> None:
+        RAY_NOSET_VISIBLE_DEVICES_LIST = [
+            "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
+            "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES",
+        ]
+        is_ray_noset_visible_devices = any(os.environ.get(env_var, None) for env_var in RAY_NOSET_VISIBLE_DEVICES_LIST)
+        if is_ray_noset_visible_devices:
+            device_name = get_device_name().upper()
+            local_rank = ray.get_runtime_context().get_accelerator_ids()[device_name][0]
+            os.environ["LOCAL_RANK"] = local_rank
+        else:
+            os.environ["LOCAL_RANK"] = "0"
+
+    def _training_function(self, config: dict[str, Any]) -> None:
+        _training_function(config)
+
+
+def _ray_training_function(ray_args: "RayArguments", config: dict[str, Any]) -> None:
+    num_workers = ray_args.ray_num_workers
+    master_addr = ray_args.master_addr
+    master_port = ray_args.master_port
+    logger.info(f"Using ray.remote mode with {num_workers} workers for distributed training.")
+
+    # initialize ray
+    if not ray.is_initialized():
+        if ray_args.ray_init_kwargs is not None:
+            ray.init(**ray_args.ray_init_kwargs)
+        else:
+            ray.init()
+
+    # verify resources
+    device_name = get_device_name().upper()
+    total_devices = int(ray.cluster_resources().get(device_name, 0))
+    if num_workers > total_devices:
+        raise ValueError(
+            f"The number of devices in the Ray cluster ({total_devices}) should be greater than num_workers ({num_workers})."
+        )
+
+    # verify master_addr
+    if master_addr is None:
+        master_addr = get_ray_head_node_ip()
+        logger.info(f"`master_addr` is not specified, using head node ip: {master_addr}.")
+    else:
+        nodes = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]]
+        if master_addr not in nodes:
+            raise ValueError(f"The `master_addr` ({master_addr}) is not in Ray cluster or not alive ")
+
+    # create placementgroup for resource management
+    pg, bundle = get_placement_group(total_devices)
+    ray.get(pg.ready())
+    logger.info(f"Create placement group with {num_workers} bundles: {bundle}")
+
+    # get sorted_bundle_indices
+    sorted_bundle_indices = sort_placement_group_by_node_ip(pg, master_addr)
+
+    # get master port
+    if master_port is None:
+        master_port = find_available_port()
+        logger.info(f"`master_port` is not specified, using available port: {master_port}.")
+    master_port = str(master_port)
+
+    # backing up environment variables
+    current_env = dict(os.environ.items())
+
+    # launch workers
+    RayWorker = ray.remote(Worker)
+    workers = []
+    for rank in range(num_workers):
+        remote_config = get_ray_remote_config_for_worker(
+            placement_group=pg,
+            bundle_idx=sorted_bundle_indices[rank],
+            rank=rank,
+            world_size=num_workers,
+            master_addr=master_addr,
+            master_port=master_port,
+            env=current_env,
+        )
+        worker = RayWorker.options(**remote_config).remote()
+        workers.append(worker)
+
+    ray.get([worker._training_function.remote(config=config) for worker in workers])
+    ray.shutdown()
diff --git a/LlamaFactory/src/llamafactory/v1/__init__.py b/LlamaFactory/src/llamafactory/v1/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/accelerator/__init__.py b/LlamaFactory/src/llamafactory/v1/accelerator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/accelerator/helper.py b/LlamaFactory/src/llamafactory/v1/accelerator/helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..803ed54e34ae21a9da43820af81c41cb981ca3d6
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/accelerator/helper.py
@@ -0,0 +1,235 @@
+# Copyright 2025 Bytedance Ltd. and the LlamaFactory team.
+#
+# This code is inspired by the Bytedance's VeOmni library.
+# https://github.com/ByteDance-Seed/VeOmni/blob/v0.1.4/veomni/utils/dist_utils.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions used by the distributed interface.
+
+Including:
+- Environment info (rank, world_size, local_rank, etc.)
+- Accelerator info (device type, device count, etc.)
+- Collective communication operations (all_gather, all_reduce, broadcast)
+- Synchronize processes and ensure main-process-first execution order
+"""
+
+import os
+from collections.abc import Callable
+from contextlib import contextmanager
+from enum import Enum, unique
+from functools import lru_cache, wraps
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from ..utils.types import ProcessGroup, Tensor, TensorLike
+
+
+@unique
+class DeviceType(str, Enum):
+    """Device type names used in this module.
+
+    Members are also ``str`` instances, so they compare equal to their plain string value.
+    """
+
+    CPU = "cpu"
+    CUDA = "cuda"
+    META = "meta"
+    MPS = "mps"
+    NPU = "npu"
+    XPU = "xpu"
+
+
+@unique
+class ReduceOp(str, Enum):
+    """Reduction operations accepted by ``all_reduce`` in this module."""
+
+    SUM = "sum"
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+
+
+def requires_accelerator(fn):
+    """Decorator to check if torch.accelerator is available.
+
+    Note: this api requires torch>=2.7.0, otherwise it will raise an AttributeError or RuntimeError
+    """
+
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if not hasattr(torch, "accelerator"):
+            raise RuntimeError("torch.accelerator is not available, please upgrade torch to 2.7.0 or higher.")
+
+        return fn(*args, **kwargs)
+
+    return wrapper
+
+
+def is_distributed() -> bool:
+    """Check if distributed environment is available."""
+    return os.getenv("RANK") is not None
+
+
+def get_rank() -> int:
+    """Get rank."""
+    return int(os.getenv("RANK", "0"))
+
+
+def get_world_size() -> int:
+    """Get world size."""
+    return int(os.getenv("WORLD_SIZE", "1"))
+
+
+def get_local_rank() -> int:
+    """Get local rank."""
+    return int(os.getenv("LOCAL_RANK", "0"))
+
+
+def get_local_world_size() -> int:
+    """Get local world size."""
+    return int(os.getenv("LOCAL_WORLD_SIZE", "1"))
+
+
+@lru_cache
+@requires_accelerator
+def get_current_accelerator(check_available: bool = True) -> torch.device:
+    """Get current accelerator."""
+    accelerator = torch.accelerator.current_accelerator(check_available=check_available)
+    return accelerator or torch.device(DeviceType.CPU.value)
+
+
+@lru_cache
+@requires_accelerator
+def get_device_count() -> int:
+    """Get the number of available devices.
+
+    Returns ``torch.accelerator.device_count()``; the value is cached after the first call.
+    """
+    return torch.accelerator.device_count()
+
+
+@requires_accelerator
+def synchronize() -> None:
+    """Call ``torch.accelerator.synchronize()``.
+
+    NOTE(review): this is a device-level synchronization, not a barrier across processes
+    (no ``torch.distributed`` call is made here).
+    """
+    torch.accelerator.synchronize()
+
+
+@requires_accelerator
+def set_device_index() -> None:
+    """Set current accelerator index to local rank."""
+    if get_current_accelerator().type != DeviceType.CPU:
+        torch.accelerator.set_device_index(get_local_rank())
+
+
+@requires_accelerator
+def get_current_device() -> torch.device:
+    """Get current accelerator device."""
+    if get_current_accelerator().type == DeviceType.CPU:
+        return torch.device(DeviceType.CPU.value)
+    else:
+        return torch.device(type=get_current_accelerator().type, index=torch.accelerator.current_device_index())
+
+
+def is_torch_cuda_available() -> bool:
+    """Return True when the current accelerator's type is CUDA."""
+    return get_current_accelerator().type == DeviceType.CUDA
+
+
+def is_torch_mps_available() -> bool:
+    """Return True when the current accelerator's type is MPS."""
+    return get_current_accelerator().type == DeviceType.MPS
+
+
+def is_torch_npu_available() -> bool:
+    """Return True when the current accelerator's type is NPU."""
+    return get_current_accelerator().type == DeviceType.NPU
+
+
+def is_torch_xpu_available() -> bool:
+    """Return True when the current accelerator's type is XPU."""
+    return get_current_accelerator().type == DeviceType.XPU
+
+
+def operate_tensorlike(fn: Callable[..., Tensor], data: TensorLike, **kwargs) -> TensorLike:
+    """Apply ``fn`` to tensor-like data on the current accelerator and return it in its input form.
+
+    Args:
+        fn: Callable invoked as ``fn(tensor, **kwargs)``.
+        data: A ``torch.Tensor``, a ``numpy.ndarray``, or any value ``torch.tensor`` accepts.
+        **kwargs: Forwarded to ``fn``.
+
+    Returns:
+        For tensor input, the result moved back to the input's device; for array input, a NumPy
+        array; otherwise a Python scalar when the result has one element, else a list.
+    """
+    device = get_current_accelerator()
+    is_tensor = isinstance(data, torch.Tensor)
+    is_ndarray = isinstance(data, np.ndarray)
+
+    if is_tensor:
+        orig_device = data.device
+        data = data.to(device=device)
+    elif is_ndarray:
+        # Arrays are converted to float tensors regardless of their original dtype.
+        data = torch.from_numpy(data).to(device=device, dtype=torch.float)
+    else:
+        # Plain Python values are likewise converted to float tensors.
+        data = torch.tensor(data, dtype=torch.float, device=device)
+
+    result = fn(data, **kwargs)
+
+    # Convert the result back to the kind of object the caller passed in.
+    if is_tensor:
+        return result.to(orig_device)
+    elif is_ndarray:
+        return result.cpu().numpy()
+    elif result.numel() == 1:
+        return result.item()
+    else:
+        return result.tolist()
+
+
+def get_process_group_backend() -> str:
+    """Get backend for init process group."""
+    if get_current_accelerator().type == DeviceType.NPU:
+        return "hccl"
+    elif get_current_accelerator().type == DeviceType.CUDA:
+        return "nccl"
+    else:
+        return "gloo"
+
+
+def all_gather(tensor: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
+    """Gathers the tensor from all ranks and stacks them at the first dim."""
+    world_size = get_world_size()
+    output_tensor = torch.empty(world_size * tensor.numel(), dtype=tensor.dtype, device=tensor.device)
+    dist.all_gather_into_tensor(output_tensor, tensor, group=group)
+    return output_tensor.view(-1, *tensor.size())
+
+
+def all_reduce(tensor: Tensor, op: ReduceOp = ReduceOp.MEAN, group: Optional[ProcessGroup] = None) -> Tensor:
+    """Performs all reduce in the given process group."""
+    reduce_ops = {
+        ReduceOp.MEAN: dist.ReduceOp.SUM,
+        ReduceOp.SUM: dist.ReduceOp.SUM,
+        ReduceOp.MAX: dist.ReduceOp.MAX,
+        ReduceOp.MIN: dist.ReduceOp.MIN,
+    }
+    dist.all_reduce(tensor, op=reduce_ops[op], group=group)
+    if op == ReduceOp.MEAN:  # ReduceOp.AVG is not supported by the NPU backend
+        tensor /= dist.get_world_size(group=group)
+
+    return tensor
+
+
+def broadcast(tensor: Tensor, src: int = 0, group: Optional[ProcessGroup] = None) -> Tensor:
+    """Broadcasts the tensor from the src process to all other processes.
+
+    Args:
+        tensor: Tensor passed to ``dist.broadcast``; the same object is returned.
+        src: Source rank forwarded to ``dist.broadcast``.
+        group: Process group forwarded to ``dist.broadcast``.
+
+    Returns:
+        The ``tensor`` argument.
+    """
+    dist.broadcast(tensor, src=src, group=group)
+    return tensor
+
+
+@contextmanager
+def main_process_first(local_only: bool = True) -> None:
+    """A context manager for torch distributed environment to do something on the main process firstly.
+
+    Args:
+        local_only: When true the main process is the one with local rank 0, otherwise the one
+            with global rank 0.
+
+    With a world size of 1 the body simply runs without any barrier.
+    """
+    if get_world_size() > 1:
+        is_main_process = get_local_rank() == 0 if local_only else get_rank() == 0
+        try:
+            # Non-main processes wait at the barrier before running the body.
+            if not is_main_process:
+                dist.barrier()
+            yield
+        finally:
+            # The main process reaches the barrier only after its body has finished (even on
+            # error), which is the matching barrier call for the waiting processes.
+            if is_main_process:
+                dist.barrier()
+    else:
+        yield
diff --git a/LlamaFactory/src/llamafactory/v1/accelerator/interface.py b/LlamaFactory/src/llamafactory/v1/accelerator/interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..e31afdc79b5751155c90c27f2c5de84b766dbf72
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/accelerator/interface.py
@@ -0,0 +1,260 @@
+# Copyright 2025 Bytedance Ltd. and the LlamaFactory team.
+#
+# This code is inspired by the Bytedance's VeOmni library.
+# https://github.com/ByteDance-Seed/VeOmni/blob/v0.1.4/veomni/distributed/parallel_state.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A unified interface for model parallelism and data parallelism.
+
+Supports model parallelism types:
+- mp_replicate: Replicate model across multiple devices.
+- mp_shard: Shard model across multiple devices.
+
+And data parallelism types:
+- dp: Data parallelism.
+- cp: Context parallelism.
+"""
+
+from dataclasses import dataclass
+from datetime import timedelta
+from enum import Enum
+from typing import Any, Optional
+
+from torch.distributed import barrier, destroy_process_group, init_process_group
+from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+
+from ..utils import logging
+from ..utils.types import DistributedConfig, ProcessGroup, TensorLike
+from . import helper
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dim(str, Enum):
+    """Dimension names.
+
+    The two ``MP_*`` members name the model mesh dimensions and ``DP``/``CP`` the data mesh
+    dimensions used by ``DistributedStrategy``.
+    """
+
+    MP_REPLICATE = "mp_replicate"
+    MP_SHARD = "mp_shard"
+    DP = "dp"
+    CP = "cp"
+
+
+@dataclass
+class DistributedStrategy:
+    """Distributed strategy."""
+
+    mp_replicate_size: int = 1
+    """Model parallel replicate size, default to 1."""
+    mp_shard_size: int | None = None
+    """Model parallel shard size, default to world_size // mp_replicate_size."""
+    dp_size: int | None = None
+    """Data parallel size, default to world_size // cp_size."""
+    cp_size: int = 1
+    """Context parallel size, default to 1."""
+
+    def __post_init__(self) -> None:
+        if not helper.is_distributed():
+            self.mp_shard_size = 1
+        elif self.mp_shard_size is None:
+            self.mp_shard_size = helper.get_world_size() // self.mp_replicate_size
+        elif self.mp_replicate_size * self.mp_shard_size != helper.get_world_size():
+            raise ValueError(
+                f"mp_replicate_size * mp_shard_size must equal to world_size, "
+                f"got {self.mp_replicate_size} * {self.mp_shard_size} != {helper.get_world_size()}."
+            )
+
+        if not helper.is_distributed():
+            self.dp_size = 1
+        elif self.dp_size is None:
+            self.dp_size = helper.get_world_size() // self.cp_size
+        elif self.dp_size * self.cp_size != helper.get_world_size():
+            raise ValueError(
+                f"dp_size * cp_size must equal to world_size, "
+                f"got {self.dp_size} * {self.cp_size} != {helper.get_world_size()}."
+            )
+
+    @property
+    def model_mesh_shape(self) -> tuple[int, int]:
+        """Model parallel mesh shape."""
+        return (self.mp_replicate_size, self.mp_shard_size)
+
+    @property
+    def model_mesh_dim_names(self) -> tuple[str, str]:
+        """Model parallel mesh dimension names."""
+        return (Dim.MP_REPLICATE.value, Dim.MP_SHARD.value)
+
+    @property
+    def data_mesh_shape(self) -> tuple[int, int]:
+        """Data parallel mesh shape."""
+        return (self.dp_size, self.cp_size)
+
+    @property
+    def data_mesh_dim_names(self) -> tuple[str, str]:
+        """Data parallel mesh dimension names."""
+        return (Dim.DP.value, Dim.CP.value)
+
+
+class DistributedInterface:
+    """Distributed interface."""
+
+    _instance: Optional["DistributedInterface"] = None
+    _initialized: bool = False
+
+    def __new__(cls, *args: Any, **kwargs: Any) -> "DistributedInterface":
+        """Singleton pattern."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+
+        return cls._instance
+
+    def __init__(self, config: DistributedConfig | None = None) -> None:
+        """Initialize the interface once; later constructions of the singleton return early.
+
+        Args:
+            config: Optional mapping read with ``.get`` for ``mp_replicate_size``, ``mp_shard_size``,
+                ``dp_size``, ``cp_size`` and ``timeout`` (seconds, default 18000).
+        """
+        if self._initialized:
+            return
+
+        # Select the device before querying the current device and device count below.
+        helper.set_device_index()
+        self._is_distributed = helper.is_distributed()
+        self._rank = helper.get_rank()
+        self._world_size = helper.get_world_size()
+        self._local_rank = helper.get_local_rank()
+        self._local_world_size = helper.get_local_world_size()
+        self.current_device = helper.get_current_device()
+        self.device_count = helper.get_device_count()
+
+        if config is None:
+            self.strategy = DistributedStrategy()
+            timeout = 18000
+        else:
+            self.strategy = DistributedStrategy(
+                mp_replicate_size=config.get("mp_replicate_size", 1),
+                mp_shard_size=config.get("mp_shard_size", None),
+                dp_size=config.get("dp_size", None),
+                cp_size=config.get("cp_size", 1),
+            )
+            timeout = config.get("timeout", 18000)
+
+        if self._is_distributed:
+            # The process group is created first; both device meshes are then built on top of it.
+            init_process_group(timeout=timedelta(seconds=timeout), backend=helper.get_process_group_backend())
+            self.model_device_mesh = init_device_mesh(
+                device_type=self.current_device.type,
+                mesh_shape=self.strategy.model_mesh_shape,
+                mesh_dim_names=self.strategy.model_mesh_dim_names,
+            )
+            self.data_device_mesh = init_device_mesh(
+                device_type=self.current_device.type,
+                mesh_shape=self.strategy.data_mesh_shape,
+                mesh_dim_names=self.strategy.data_mesh_dim_names,
+            )
+        else:
+            self.model_device_mesh = None
+            self.data_device_mesh = None
+
+        # Set on the (singleton) instance so that subsequent __init__ calls return immediately.
+        self._initialized = True
+        logger.info_rank0(f"DistributedInterface initialized: {self}.")
+
+    def __str__(self) -> str:
+        return (
+            f"DistributedInterface(strategy={self.strategy}), is_distributed={self._is_distributed}, "
+            f"current_device={self.current_device}, rank={self._rank}, world_size={self._world_size}, "
+            f"model_device_mesh={self.model_device_mesh}, data_device_mesh={self.data_device_mesh}"
+        )
+
+    def get_device_mesh(self, dim: Dim | None = None) -> DeviceMesh | None:
+        """Get device mesh for specified dimension."""
+        if dim is None:
+            raise ValueError("dim must be specified.")
+        elif not self._is_distributed:
+            return None
+        elif dim in self.strategy.data_mesh_dim_names:
+            return self.data_device_mesh[dim.value]
+        else:
+            return self.model_device_mesh[dim.value]
+
+    def get_group(self, dim: Dim | None = None) -> Optional[ProcessGroup]:
+        """Get process group for specified dimension."""
+        if not self._is_distributed or dim is None:
+            return None
+        else:
+            return self.get_device_mesh(dim).get_group()
+
+    def get_rank(self, dim: Dim | None = None) -> int:
+        """Get parallel rank for specified dimension."""
+        if not self._is_distributed:
+            return 0
+        elif dim is None:
+            return self._rank
+        else:
+            return self.get_device_mesh(dim).get_local_rank()
+
+    def get_world_size(self, dim: Dim | None = None) -> int:
+        """Get parallel size for specified dimension."""
+        if not self._is_distributed:
+            return 1
+        elif dim is None:
+            return self._world_size
+        else:
+            return self.get_device_mesh(dim).size()
+
+    def get_local_rank(self) -> int:
+        """Get parallel local rank."""
+        return self._local_rank
+
+    def get_local_world_size(self) -> int:
+        """Get parallel local world size."""
+        return self._local_world_size
+
+    def all_gather(self, data: TensorLike, dim: Dim | None = Dim.DP) -> TensorLike:
+        """Gather tensor across specified parallel group."""
+        if self._is_distributed:
+            return helper.operate_tensorlike(helper.all_gather, data, group=self.get_group(dim))
+        else:
+            return data
+
+    def all_reduce(
+        self, data: TensorLike, op: helper.ReduceOp = helper.ReduceOp.MEAN, dim: Dim | None = Dim.DP
+    ) -> TensorLike:
+        """Reduce tensor across specified parallel group."""
+        if self._is_distributed:
+            return helper.operate_tensorlike(helper.all_reduce, data, op=op, group=self.get_group(dim))
+        else:
+            return data
+
+    def broadcast(self, data: TensorLike, src: int = 0, dim: Dim | None = Dim.DP) -> TensorLike:
+        """Broadcast tensor across specified parallel group."""
+        if self._is_distributed:
+            return helper.operate_tensorlike(helper.broadcast, data, src=src, group=self.get_group(dim))
+        else:
+            return data
+
+    def sync(self) -> None:
+        """Synchronize all processes."""
+        if self._is_distributed:
+            helper.synchronize()
+
+    def barrier(self) -> None:
+        """Barrier all processes."""
+        if self._is_distributed:
+            barrier()
+
+    def destroy(self) -> None:
+        """Destroy all processes."""
+        if self._is_distributed:
+            destroy_process_group()
+
+
+if __name__ == "__main__":
+    # Manual smoke test; run it with:
+    #
+    #     python -m llamafactory.v1.accelerator.interface
+    print(DistributedInterface())
diff --git a/LlamaFactory/src/llamafactory/v1/accelerator/profiler.py b/LlamaFactory/src/llamafactory/v1/accelerator/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/config/__init__.py b/LlamaFactory/src/llamafactory/v1/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f334d0524d94c2a84a7ef2f1b5e30ad1fd50e414
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/config/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .arg_parser import InputArgument, get_args
+from .arg_utils import BatchingStrategy, ModelClass, SampleBackend
+from .data_args import DataArguments
+from .model_args import ModelArguments
+from .sample_args import SampleArguments
+from .training_args import TrainingArguments
+
+
+__all__ = [
+    "BatchingStrategy",
+    "DataArguments",
+    "InputArgument",
+    "ModelArguments",
+    "ModelClass",
+    "SampleArguments",
+    "SampleBackend",
+    "TrainingArguments",
+    "get_args",
+]
diff --git a/LlamaFactory/src/llamafactory/v1/config/arg_parser.py b/LlamaFactory/src/llamafactory/v1/config/arg_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..2122a569f974176c29ac58180156ba0fe6542795
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/config/arg_parser.py
@@ -0,0 +1,63 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+from omegaconf import OmegaConf
+from transformers import HfArgumentParser
+
+from ..utils.env import is_env_enabled
+from .data_args import DataArguments
+from .model_args import ModelArguments
+from .sample_args import SampleArguments
+from .training_args import TrainingArguments
+
+
+InputArgument = dict[str, Any] | list[str] | None  # config dict, argv-style list, or None to read sys.argv
+
+
+def get_args(args: InputArgument = None) -> tuple[ModelArguments, DataArguments, TrainingArguments, SampleArguments]:
+    """Parse arguments from a dict, an argv-style list, or (if ``args`` is None) ``sys.argv`` / a config file."""
+    parser = HfArgumentParser([ModelArguments, DataArguments, TrainingArguments, SampleArguments])
+    allow_extra_keys = is_env_enabled("ALLOW_EXTRA_KEYS")  # when set, unknown keys/arguments are tolerated
+
+    if args is None:
+        if len(sys.argv) > 1 and sys.argv[1].endswith((".yaml", ".yml")):
+            override_config = OmegaConf.from_cli(sys.argv[2:])  # CLI tokens after the file path
+            dict_config = OmegaConf.load(Path(sys.argv[1]).absolute())
+            args = OmegaConf.to_container(OmegaConf.merge(dict_config, override_config))
+        elif len(sys.argv) > 1 and sys.argv[1].endswith(".json"):
+            override_config = OmegaConf.from_cli(sys.argv[2:])
+            dict_config = OmegaConf.create(json.loads(Path(sys.argv[1]).absolute().read_text(encoding="utf-8")))
+            args = OmegaConf.to_container(OmegaConf.merge(dict_config, override_config))
+        else:  # list of strings
+            args = sys.argv[1:]
+
+    if isinstance(args, dict):
+        (*parsed_args,) = parser.parse_dict(args, allow_extra_keys=allow_extra_keys)
+    else:
+        (*parsed_args, unknown_args) = parser.parse_args_into_dataclasses(args, return_remaining_strings=True)
+        if unknown_args and not allow_extra_keys:  # fail loudly instead of silently ignoring arguments
+            print(parser.format_help())
+            print(f"Got unknown args, potentially deprecated arguments: {unknown_args}")
+            raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {unknown_args}")
+
+    return tuple(parsed_args)
+
+
+if __name__ == "__main__":  # manual check: parse sys.argv and print the resulting dataclasses
+    print(get_args())
diff --git a/LlamaFactory/src/llamafactory/v1/config/arg_utils.py b/LlamaFactory/src/llamafactory/v1/config/arg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..db52f970097ebd6637bf053871a1a093b7725c24
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/config/arg_utils.py
@@ -0,0 +1,103 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/training_args.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+from enum import StrEnum, unique
+
+
+class PluginConfig(dict):
+    """Plain ``dict`` of plugin settings that also exposes the required ``name`` key as a property."""
+
+    @property
+    def name(self) -> str:
+        """Value stored under ``"name"``; a missing key raises ``ValueError``."""
+        if "name" in self:
+            return self["name"]
+
+        raise ValueError("Plugin configuration must have a 'name' field.")
+
+
+PluginArgument = PluginConfig | dict | str | None  # raw value accepted by get_plugin_config()
+
+
+@unique  # each member must have a distinct value
+class ModelClass(StrEnum):
+    """Auto class for model config (the type of ``ModelArguments.model_class``)."""
+
+    LLM = "llm"
+    CLS = "cls"
+    OTHER = "other"
+
+
+@unique  # each member must have a distinct value
+class SampleBackend(StrEnum):  # the type of SampleArguments.sample_backend
+    HF = "hf"
+    VLLM = "vllm"
+
+
+@unique  # each member must have a distinct value
+class BatchingStrategy(StrEnum):  # the type of TrainingArguments.batching_strategy
+    NORMAL = "normal"
+    PADDING_FREE = "padding_free"
+    DYNAMIC_BATCHING = "dynamic_batching"
+    DYNAMIC_PADDING_FREE = "dynamic_padding_free"
+
+
+def _convert_str_dict(data: dict) -> dict:
+    """Convert boolean and numeric strings inside the dictionary (recursively, in place).
+
+    Args:
+        data: The dictionary to convert; it is modified in place.
+
+    Returns:
+        The same dictionary, with "true"/"false" and unsigned int/float strings parsed.
+    """
+    for key, value in data.items():
+        if isinstance(value, dict):
+            data[key] = _convert_str_dict(value)
+        elif isinstance(value, str):
+            if value.lower() in ("true", "false"):
+                data[key] = value.lower() == "true"
+            elif value.isdecimal():  # isdigit() also accepts e.g. "²", which int() rejects
+                data[key] = int(value)
+            elif value.replace(".", "", 1).isdecimal():  # at most one dot, digits otherwise
+                data[key] = float(value)
+
+    return data
+
+
+def get_plugin_config(config: PluginArgument) -> PluginConfig | None:
+    """Get the plugin configuration from the argument value.
+
+    Args:
+        config: ``None``, a dict (or ``PluginConfig``), or a JSON object string.
+
+    Returns:
+        The plugin configuration (``None`` for ``None``); invalid input raises ``ValueError``.
+    """
+    if config is None:
+        return None
+
+    if isinstance(config, str):  # e.g. a JSON object passed on the command line
+        config = json.loads(config)  # malformed JSON raises json.JSONDecodeError, a ValueError subclass
+    if not isinstance(config, dict):
+        raise ValueError(f"Plugin configuration must be a dict or a JSON object, got {type(config).__name__}.")
+    if "name" not in config:
+        raise ValueError("Plugin configuration must have a 'name' field.")
+
+    return PluginConfig(_convert_str_dict(config))
diff --git a/LlamaFactory/src/llamafactory/v1/config/data_args.py b/LlamaFactory/src/llamafactory/v1/config/data_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..8693df429ebb2be2a1a447cbdc634aea64e39c1b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/config/data_args.py
@@ -0,0 +1,28 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class DataArguments:
+    """Dataset paths for training and evaluation.
+
+    Both fields are optional and default to ``None``; each field's ``help`` metadata
+    holds its human-readable description.
+    """
+
+    train_dataset: str | None = field(default=None, metadata={"help": "Path to the training dataset."})
+    eval_dataset: str | None = field(default=None, metadata={"help": "Path to the evaluation dataset."})
diff --git a/LlamaFactory/src/llamafactory/v1/config/model_args.py b/LlamaFactory/src/llamafactory/v1/config/model_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ab1e561fa0432e58ac675252d9aafe8e354b189
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/config/model_args.py
@@ -0,0 +1,60 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass, field
+
+from .arg_utils import ModelClass, PluginConfig, get_plugin_config
+
+
+@dataclass
+class ModelArguments:
+    """Model selection options plus per-plugin configurations (normalized in ``__post_init__``)."""
+
+    model: str = field(
+        default="Qwen/Qwen3-4B-Instruct-2507",
+        metadata={"help": "Path to the model or model identifier from Hugging Face."},
+    )
+    template: str = field(
+        default="qwen3_nothink",
+        metadata={"help": "Template for the model."},
+    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={"help": "Trust remote code from Hugging Face."},
+    )
+    model_class: ModelClass = field(
+        default=ModelClass.LLM,
+        metadata={"help": "Model class from Hugging Face."},
+    )
+    init_config: PluginConfig | None = field(
+        default=None,
+        metadata={"help": "Initialization configuration for the model."},
+    )
+    peft_config: PluginConfig | None = field(
+        default=None,
+        metadata={"help": "PEFT configuration for the model."},
+    )
+    kernel_config: PluginConfig | None = field(
+        default=None,
+        metadata={"help": "Kernel configuration for the model."},
+    )
+    quant_config: PluginConfig | None = field(
+        default=None,
+        metadata={"help": "Quantization configuration for the model."},
+    )
+
+    def __post_init__(self) -> None:
+        for plugin_field in ("init_config", "peft_config", "kernel_config", "quant_config"):
+            setattr(self, plugin_field, get_plugin_config(getattr(self, plugin_field)))
diff --git a/LlamaFactory/src/llamafactory/v1/config/sample_args.py b/LlamaFactory/src/llamafactory/v1/config/sample_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..3971ee71edecb61eb5c02046ca863d5833fac596
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/config/sample_args.py
@@ -0,0 +1,30 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass, field
+
+from .arg_utils import SampleBackend
+
+
+@dataclass
+class SampleArguments:
+    """Options that control sampling (text generation)."""
+
+    # Chooses which inference engine the sampler builds.
+    sample_backend: SampleBackend = field(
+        default=SampleBackend.HF,
+        metadata={"help": "Sampling backend, default to 'hf'."},
+    )
+    max_new_tokens: int = field(default=128, metadata={"help": "Maximum number of new tokens to generate."})
diff --git a/LlamaFactory/src/llamafactory/v1/config/training_args.py b/LlamaFactory/src/llamafactory/v1/config/training_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fe0c1cf162c6feaaca83ff5c8d1743e22c0df51
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/config/training_args.py
@@ -0,0 +1,88 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass, field
+from uuid import uuid4
+
+from .arg_utils import BatchingStrategy, PluginConfig, get_plugin_config
+
+
+@dataclass
+class TrainingArguments:
+    """Hyper-parameters and plugin configurations for training."""
+
+    # NOTE(review): this default is evaluated once, when the class body runs, so all instances created in one
+    # process share it while separately launched processes each get a different random directory.
+    output_dir: str = field(
+        default=os.path.join("outputs", uuid4().hex),
+        metadata={"help": "Path to the output directory."},
+    )
+    micro_batch_size: int = field(
+        default=1,
+        metadata={"help": "Micro batch size for training."},
+    )
+    global_batch_size: int | None = field(
+        default=None,
+        metadata={"help": "Global batch size for training, default to DP size * micro batch size."},
+    )
+    cutoff_len: int = field(
+        default=2048,
+        metadata={"help": "Maximum sequence length for training."},
+    )
+    learning_rate: float = field(
+        default=1e-4,
+        metadata={"help": "Learning rate for training."},
+    )
+    num_train_epochs: int = field(
+        default=3,
+        metadata={"help": "Number of training epochs."},
+    )
+    max_steps: int | None = field(
+        default=None,
+        metadata={"help": "Maximum number of training steps. If set, overrides num_train_epochs."},
+    )
+    max_grad_norm: float = field(
+        default=1.0,
+        metadata={"help": "Maximum gradient norm for training."},
+    )
+    bf16: bool = field(default=False, metadata={"help": "Use bf16 for training."})
+    batching_strategy: BatchingStrategy = field(
+        default=BatchingStrategy.NORMAL,
+        metadata={"help": "Batching strategy for training."},
+    )
+    batching_workers: int = field(
+        default=16,
+        metadata={"help": "Number of workers for batching."},
+    )
+    enable_activation_checkpointing: bool = field(
+        default=True,
+        metadata={"help": "Enable activation checkpointing for training."},
+    )
+    dist_config: PluginConfig | None = field(
+        default=None,
+        metadata={"help": "Distribution configuration for training."},
+    )
+    optim_config: PluginConfig | None = field(
+        default=None,
+        metadata={"help": "Optimizer configuration for training."},
+    )
+    lr_scheduler_config: PluginConfig | None = field(
+        default=None,
+        metadata={"help": "Learning rate scheduler configuration for training."},
+    )
+
+    def __post_init__(self) -> None:
+        for plugin_field in ("dist_config", "optim_config", "lr_scheduler_config"):
+            setattr(self, plugin_field, get_plugin_config(getattr(self, plugin_field)))
diff --git a/LlamaFactory/src/llamafactory/v1/core/__init__.py b/LlamaFactory/src/llamafactory/v1/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/core/base_sampler.py b/LlamaFactory/src/llamafactory/v1/core/base_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1080b3f139a2cfe3bbd535599b1cd6a301fa5ae4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/core/base_sampler.py
@@ -0,0 +1,67 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import AsyncGenerator
+
+from ..config import ModelArguments, SampleArguments, SampleBackend
+from ..utils.types import HFModel, Message, Sample, TorchDataset
+from .utils.inference_engine import HuggingFaceEngine
+from .utils.rendering import Renderer
+
+
+class BaseSampler:
+    """Sampler facade that forwards generation requests to a backend engine.
+
+    Args:
+        args: Sample arguments; ``sample_backend`` selects the engine.
+        model_args: Model arguments, passed through to the engine.
+        model: Model, passed through to the engine.
+        renderer: Renderer, passed through to the engine.
+    """
+
+    def __init__(
+        self,
+        args: SampleArguments,
+        model_args: ModelArguments,
+        model: HFModel,
+        renderer: Renderer,
+    ) -> None:
+        if args.sample_backend != SampleBackend.HF:
+            raise ValueError(f"Unknown sample backend: {args.sample_backend}")
+
+        self.engine = HuggingFaceEngine(args, model_args, model, renderer)
+
+    async def generate(self, messages: list[Message], tools: str | None = None) -> AsyncGenerator[str, None]:
+        """Stream generated tokens from the engine.
+
+        Args:
+            messages: List of messages.
+            tools: Tools string.
+
+        Yields:
+            Tokens, in the order the engine produces them.
+        """
+        async for piece in self.engine.generate(messages, tools):
+            yield piece
+
+    async def batch_infer(self, dataset: TorchDataset) -> list[Sample]:
+        """Run batch inference on a dataset through the engine.
+
+        Args:
+            dataset: Torch dataset.
+
+        Returns:
+            List of samples returned by the engine.
+        """
+        return await self.engine.batch_infer(dataset)
diff --git a/LlamaFactory/src/llamafactory/v1/core/base_trainer.py b/LlamaFactory/src/llamafactory/v1/core/base_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d97c80731c5b451beca81a2f54acb38377bb6a8
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/core/base_trainer.py
@@ -0,0 +1,209 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of trainer.
+
+Init Phase:
+
+1. Init batch generator.
+2. Init optimizer (deepspeed).
+3. Shard model.
+4. Init optimizer (fsdp).
+5. Init lr scheduler.
+
+Train Phase:
+1. Train Loop
+
+"""
+
+from abc import abstractmethod
+
+import torch
+import torch.nn.functional as F
+
+from ..accelerator.helper import ReduceOp
+from ..accelerator.interface import Dim, DistributedInterface
+from ..config import TrainingArguments
+from ..utils import logging
+from ..utils.helper import compute_valid_tokens
+from ..utils.types import BatchInput, HFModel, ModelOutput, Tensor, TorchDataset
+from .utils.batching import BatchGenerator
+from .utils.rendering import Renderer
+
+
+logger = logging.get_logger(__name__)
+
+
+class BaseTrainer:
+    """Base training loop; subclasses provide the loss by implementing ``compute_loss``."""
+
+    def __init__(
+        self,
+        args: TrainingArguments,
+        model: HFModel,
+        renderer: Renderer,
+        train_dataset: TorchDataset,
+    ) -> None:
+        """Set up batching, model sharding, the optimizer and the LR scheduler."""
+        self.args = args
+        self.model = model
+        self.renderer = renderer
+        self.train_dataset = train_dataset
+        self.global_step = 0  # incremented once per global batch in fit()
+
+        # cached variables
+        self.device = DistributedInterface().current_device
+        self.dp_size = DistributedInterface().get_world_size(Dim.DP)
+        self.model_input_names = self.renderer.processor.model_input_names
+
+        self._create_batch_generator()
+        # Calculate num_training_steps: max_steps takes priority if set
+        if self.args.max_steps is not None and self.args.max_steps > 0:
+            self.num_training_steps = self.args.max_steps
+        else:
+            self.num_training_steps = self.args.num_train_epochs * len(self.train_batch_generator)
+
+        if self.args.enable_activation_checkpointing:
+            self.model.gradient_checkpointing_enable({"use_reentrant": False})
+
+        shard_need_optimizer = self.args.dist_config is not None and self.args.dist_config.name == "deepspeed"
+
+        if shard_need_optimizer:  # deepspeed: create the optimizer first, then shard
+            self._init_optimizer()
+            self._shard_model()
+        else:
+            self._shard_model()
+            self._init_optimizer()
+
+        self._init_lr_scheduler()
+
+    def _create_batch_generator(self) -> None:
+        """Build ``self.train_batch_generator`` from the dataset and the batching arguments."""
+        self.train_batch_generator = BatchGenerator(
+            dataset=self.train_dataset,
+            renderer=self.renderer,
+            micro_batch_size=self.args.micro_batch_size,
+            global_batch_size=self.args.global_batch_size,
+            cutoff_len=self.args.cutoff_len,
+            batching_workers=self.args.batching_workers,
+            batching_strategy=self.args.batching_strategy,
+        )
+
+    def _shard_model(self) -> None:
+        """Wrap the model with the ``dist_config`` plugin, or with DDP when no plugin is set and DP size > 1."""
+        if self.args.dist_config is None:
+            if DistributedInterface().get_world_size(Dim.DP) > 1:
+                from torch.nn.parallel import DistributedDataParallel as DDP
+
+                logger.warning_rank0(
+                    "dist_config is None but distributed training is enabled; falling back to DistributedDataParallel."
+                )
+                device_ids = None if self.device.type == "cpu" else [self.device.index]
+                self.model = DDP(self.model, device_ids=device_ids)
+        else:
+            from ..plugins.trainer_plugins.distributed.hub import DistributedPlugin
+
+            self.model = DistributedPlugin(self.args.dist_config.name)(
+                self.model,
+                self.args.dist_config,
+            )
+
+    def _init_optimizer(self) -> None:
+        """Init optimizer: AdamW over trainable parameters unless ``optim_config`` names a plugin."""
+        if self.args.optim_config is None:
+            _trainable_params = [p for p in self.model.parameters() if p.requires_grad]
+            self.optimizer = torch.optim.AdamW(_trainable_params, lr=self.args.learning_rate)
+        else:
+            from ..plugins.trainer_plugins.optimizer import OptimizerPlugin
+
+            self.optimizer = OptimizerPlugin(self.args.optim_config.name)(self.model, self.args.optim_config)
+
+    def _init_lr_scheduler(self) -> None:
+        """Init lr scheduler: constant learning rate unless ``lr_scheduler_config`` names a plugin."""
+        if self.args.lr_scheduler_config is None:
+            self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lambda x: 1.0)
+        else:
+            from ..plugins.trainer_plugins.lr_scheduler import LRSchedulerPlugin
+
+            self.lr_scheduler = LRSchedulerPlugin(self.args.lr_scheduler_config.name)(
+                self.optimizer, self.num_training_steps, self.args.lr_scheduler_config
+            )
+
+    def compute_log_probs(self, model: HFModel, batch: BatchInput) -> Tensor:
+        """Compute per-token log probs of the labels (negated, unreduced cross entropy).
+
+        log_probs: Tensor of shape (batch_size, seq_len - 1)
+        """
+        batch_size, _ = batch["labels"].shape
+        model_inputs = {
+            k: v.to(self.device, non_blocking=True) for k, v in batch.items() if k in self.model_input_names
+        }
+        labels = batch["labels"].to(self.device, non_blocking=True)
+        outputs: ModelOutput = model(**model_inputs)
+        logits = outputs.logits.float()
+        shift_labels = labels[..., 1:].contiguous().view(-1)  # position t is scored against token t + 1
+        shift_logits = logits[..., :-1, :].contiguous().view(shift_labels.size(0), -1)
+        return -F.cross_entropy(shift_logits, shift_labels, reduction="none").view(batch_size, -1)
+
+    @abstractmethod
+    def compute_loss(self, batch: BatchInput) -> Tensor:
+        """Compute the scalar loss for one micro batch; subclasses must override this."""
+        raise NotImplementedError  # this class is not an ABC, so the decorator alone enforces nothing
+
+    def fit(self) -> None:
+        """Run the training loop until ``num_training_steps`` global steps are done."""
+        self.model.train()
+        # max_steps overrides num_train_epochs, so size the epoch loop from the step budget (ceil division)
+        num_epochs = -(-self.num_training_steps // max(len(self.train_batch_generator), 1))
+        for epoch in range(num_epochs):
+            self.train_batch_generator.set_epoch(epoch)
+            for micro_batches in self.train_batch_generator:
+                self.global_step += 1
+                step_loss = 0
+                step_valid_tokens = compute_valid_tokens(micro_batches)
+                step_valid_tokens = DistributedInterface().all_reduce(step_valid_tokens, op=ReduceOp.SUM)
+                for micro_batch in micro_batches:
+                    loss = self.compute_loss(micro_batch)
+                    mini_step_valid_tokens = compute_valid_tokens([micro_batch])
+                    # fsdp uses mean reduction so we need to scale the loss by dp_size
+                    loss = loss * mini_step_valid_tokens * self.dp_size / (step_valid_tokens + 1e-6)
+
+                    loss.backward()
+                    step_loss += loss.item()
+
+                grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm).item()
+
+                # isfinite(): argument 'input' (position 1) must be Tensor, not float
+                if not torch.isfinite(torch.tensor(grad_norm)):  # type: ignore # pyright: ignore [reportUnknownReturnType]
+                    logger.warning_rank0(f"Gradient norm is not finite: {grad_norm}")
+                else:
+                    self.optimizer.step()
+
+                self.lr_scheduler.step()
+                self.optimizer.zero_grad()
+
+                step_loss, grad_norm = DistributedInterface().all_reduce([step_loss, grad_norm])
+                DistributedInterface().sync()
+                if DistributedInterface().get_rank() == 0:
+                    print(f"Epoch {epoch}, Step {self.global_step}, Loss: {step_loss:.4f}, Grad Norm: {grad_norm:.4f}")
+
+                if self.global_step >= self.num_training_steps:
+                    logger.info_rank0(f"Reached max_steps ({self.num_training_steps}), stopping training.")
+                    return
+
+    def save_model(self) -> None:
+        """Save the model (unwrapped from ``.module`` if present) and the processor to ``args.output_dir``."""
+        getattr(self.model, "module", self.model).save_pretrained(self.args.output_dir)
+        self.renderer.processor.save_pretrained(self.args.output_dir)
+        logger.info_rank0(f"Model saved to {self.args.output_dir}")
diff --git a/LlamaFactory/src/llamafactory/v1/core/data_engine.py b/LlamaFactory/src/llamafactory/v1/core/data_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2cbf536e4ea746cbce3bee0eea3b1937a16cc8b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/core/data_engine.py
@@ -0,0 +1,196 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of data engine.
+
+How to use:
+data_engine = DataEngine(data_args.train_dataset)
+data_engine[i]: Get the sample via index.
+
+Init workflow:
+1. Parse dataset info from arguments.
+2. Load datasets according to dataset info.
+3. Build data index (and reweight samples if necessary).
+
+Get data sample:
+1. Get sample from data index.
+2. Convert sample to standard format.
+3. Return sample.
+
+Note:
+1. The data engine is equivalent to the torch dataset.
+2. The data engine is agnostic to the model used.
+"""
+
+import os
+from collections.abc import Iterable
+from typing import Any
+
+from huggingface_hub import hf_hub_download
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset
+
+from ..utils.types import DatasetInfo, HFDataset, Sample
+
+
+class DataEngine(Dataset):
+    """Data engine.
+
+    Args:
+        data_args: Data arguments.
+    """
+
+    def __init__(self, dataset_path: str) -> None:
+        self.path = dataset_path
+        """Dataset path."""
+        self.datasets: dict[str, HFDataset] = {}
+        """Dict of (dataset_name, dataset)"""
+        self.dataset_infos: dict[str, DatasetInfo] = {}
+        """Dict of (dataset_name, dataset_info)"""
+        self.data_index: list[tuple[str, int]] = []
+        """List of (dataset_name, sample_index)"""
+        self.streaming: bool = False
+        """Whether dataset is streaming."""
+        self._get_dataset_info()
+        self._load_dataset()
+        self._build_data_index()
+
+    def _get_dataset_info(self) -> None:
+        """Get dataset info from data arguments."""
+        if self.path.endswith(".yaml") and os.path.isfile(self.path):  # local file
+            self.dataset_infos = OmegaConf.load(self.path)
+        elif self.path.endswith(".yaml"):  # hf hub uri, e.g. llamafactory/v1-sft-demo/dataset_info.yaml
+            repo_id, filename = os.path.split(self.path)
+            filepath = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
+            self.dataset_infos = OmegaConf.load(filepath)
+        elif os.path.exists(self.path):  # local file(s)
+            self.dataset_infos = {"default": {"path": self.path, "source": "local"}}
+        else:  # hf hub dataset, e.g. llamafactory/v1-sft-demo
+            self.dataset_infos = {"default": {"path": self.path}}
+
+    def _load_dataset(self) -> None:
+        """Load datasets according to dataset info."""
+        is_streaming = [dataset_info.get("streaming", False) for dataset_info in self.dataset_infos.values()]
+        self.streaming = any(is_streaming)
+        if all(is_streaming) != any(is_streaming):
+            raise ValueError("All datasets must be streaming or non-streaming.")
+
+        for dataset_name, dataset_info in self.dataset_infos.items():
+            split = dataset_info.get("split", "train")
+            if dataset_info.get("source", "hf_hub") == "hf_hub":
+                from datasets import load_dataset
+
+                self.datasets[dataset_name] = load_dataset(dataset_info["path"], split=split, streaming=self.streaming)
+            else:  # data loader plugin
+                from ..plugins.data_plugins.loader import DataLoaderPlugin
+
+                self.datasets[dataset_name] = DataLoaderPlugin(dataset_info["source"]).load(dataset_info)
+
+    def _build_data_index(self) -> None:
+        """Build dataset index."""
+        for dataset_name, dataset in self.datasets.items():
+            if self.streaming:
+                data_index = [(dataset_name, -1) for _ in range(1000)]
+            else:
+                data_index = [(dataset_name, sample_index) for sample_index in range(len(dataset))]
+
+            size = self.dataset_infos[dataset_name].get("size")
+            weight = self.dataset_infos[dataset_name].get("weight")
+            if size or weight:
+                from ..plugins.data_plugins.loader import adjust_data_index
+
+                data_index = adjust_data_index(data_index, size, weight)
+
+            self.data_index.extend(data_index)
+
+    def _convert_data_sample(self, raw_sample: dict[str, Any], dataset_name: str) -> Sample:
+        """Convert dataset sample.
+
+        Args:
+            raw_sample (dict[str, Any]): Raw dataset sample.
+            dataset_name (str): Dataset name.
+
+        Returns:
+            Sample: Dataset sample.
+        """
+        converter = self.dataset_infos[dataset_name].get("converter")
+        if converter is not None:
+            from ..plugins.data_plugins.converter import DataConverterPlugin
+
+            return {"_dataset_name": dataset_name, **DataConverterPlugin(converter)(raw_sample)}
+        else:
+            return {"_dataset_name": dataset_name, **raw_sample}
+
+    def __len__(self) -> int:
+        """Get dataset length.
+
+        Returns:
+            int: Dataset length.
+        """
+        if self.streaming:
+            return -1
+        else:
+            return len(self.data_index)
+
+    def __getitem__(self, index: int | Any) -> Sample | list[Sample]:
+        """Get dataset item.
+
+        Args:
+            index (int): Dataset index.
+
+        Returns:
+            Sample: Dataset item.
+        """
+        if self.streaming:
+            raise ValueError("Streaming dataset does not support index access.")
+
+        if isinstance(index, int):
+            dataset_name, sample_index = self.data_index[index]
+            return self._convert_data_sample(self.datasets[dataset_name][sample_index], dataset_name)
+        else:  # data selector plugin
+            from ..plugins.data_plugins.loader import select_data_sample
+
+            selected_index = select_data_sample(self.data_index, index)
+            if isinstance(selected_index, list):
+                return [
+                    self._convert_data_sample(self.datasets[dataset_name][sample_index], dataset_name)
+                    for dataset_name, sample_index in selected_index
+                ]
+            else:
+                dataset_name, sample_index = selected_index
+                return self._convert_data_sample(self.datasets[dataset_name][sample_index], dataset_name)
+
+    def __iter__(self) -> Iterable[Sample]:
+        """Get dataset iterator.
+
+        Returns:
+            Iterable[Sample]: Dataset iterator.
+        """
+        # NOTE: hf iterable dataset uses worker ids while map dataset does not
+        # NOTE: add worker id and shuffle to the map dataset
+        # https://github.com/huggingface/datasets/blob/4.0.0/src/datasets/iterable_dataset.py#L2214
+
+        raise NotImplementedError()
+
+
+if __name__ == "__main__":
+    """
+    python -m llamafactory.v1.core.data_engine --train_dataset data/v1_sft_demo.yaml
+    python -m llamafactory.v1.core.data_engine --train_dataset data/v1_dpo_demo.yaml
+    """
+    from ..config.arg_parser import get_args
+
+    _, data_args, *_ = get_args()
+    data_engine = DataEngine(data_args.train_dataset)
+    print(data_engine[0])
diff --git a/LlamaFactory/src/llamafactory/v1/core/model_engine.py b/LlamaFactory/src/llamafactory/v1/core/model_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a5a00204f415c818e8e9148be1ee61f67737821
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/core/model_engine.py
@@ -0,0 +1,160 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of model engine.
+
+How to use:
+model_engine = ModelEngine(model_args, is_train=True)
+model_engine.processor: Get the tokenizer or multi-modal processor.
+model_engine.renderer: Get the renderer.
+model_engine.model_config: Get the model configuration.
+model_engine.model: Get the HF model.
+
+Init workflow:
+1. Init processor.
+2. Init renderer.
+3. Init model config.
+4. Init model.
+5. Init adapter.
+"""
+
+import torch
+from accelerate import init_empty_weights
+from transformers import AutoConfig, AutoProcessor
+
+from ..accelerator.helper import DeviceType
+from ..accelerator.interface import DistributedInterface
+from ..config.model_args import ModelArguments, ModelClass
+from ..utils import logging
+from ..utils.types import HFConfig, HFModel, Processor
+from .utils.rendering import Renderer
+
+
+logger = logging.get_logger(__name__)
+
+
+class ModelEngine:
+    """Model engine.
+
+    Args:
+        model_args: Model arguments.
+        is_train: Whether to train the model.
+    """
+
+    def __init__(self, model_args: ModelArguments, is_train: bool = False) -> None:
+        self.args = model_args
+        """Model arguments."""
+        self.is_train = is_train
+        """Whether to train the model."""
+        self.processor = self._init_processor()
+        """Tokenizer or multi-modal processor."""
+        self.renderer = Renderer(self.args.template, self.processor)
+        """Renderer."""
+        self.model_config = self._init_model_config()
+        """Model configuration."""
+        self.model = self._init_model()
+        """HF model."""
+
+    def _init_processor(self) -> Processor:
+        """Init processor.
+
+        NOTE: Transformers v5 always use fast tokenizer.
+        https://github.com/huggingface/transformers/blob/v5.0.0rc1/src/transformers/models/auto/tokenization_auto.py#L642
+        """
+        return AutoProcessor.from_pretrained(
+            self.args.model,
+            trust_remote_code=self.args.trust_remote_code,
+        )
+
+    def _init_model_config(self) -> HFConfig:
+        """Init model config."""
+        return AutoConfig.from_pretrained(
+            self.args.model,
+            trust_remote_code=self.args.trust_remote_code,
+        )
+
+    def _init_model(self) -> HFModel:
+        """Init model.
+
+        Transformers can choose the proper model init context.
+        https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/modeling_utils.py#L3538
+        """
+        if self.args.model_class == ModelClass.LLM:
+            from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
+
+            if type(self.model_config) in AutoModelForImageTextToText._model_mapping.keys():
+                AutoClass = AutoModelForImageTextToText
+            else:
+                AutoClass = AutoModelForCausalLM
+
+        elif self.args.model_class == ModelClass.CLS:
+            from transformers import AutoModelForTokenClassification
+
+            AutoClass = AutoModelForTokenClassification
+        else:
+            from transformers import AutoModel
+
+            AutoClass = AutoModel
+
+        if self.args.init_config is not None:
+            from ..plugins.model_plugins.initialization import InitPlugin
+
+            init_device = InitPlugin(self.args.init_config.name)()
+        else:
+            init_device = DistributedInterface().current_device
+
+        if init_device.type == DeviceType.META:
+            with init_empty_weights():
+                model = AutoClass.from_config(self.model_config)
+        else:
+            model = AutoClass.from_pretrained(
+                self.args.model,
+                config=self.model_config,
+                dtype="auto",
+                device_map=init_device,
+                trust_remote_code=self.args.trust_remote_code,
+            )
+
+        if self.args.peft_config is None:
+            if self.is_train:
+                logger.info_rank0("Fine-tuning mode: full tuning")
+                model = model.to(torch.float32)
+            else:
+                logger.info_rank0("Inference the original model")
+        else:
+            from ..plugins.model_plugins.peft import PeftPlugin
+
+            model = PeftPlugin(self.args.peft_config.name)(model, self.args.peft_config, self.is_train)
+
+        if self.args.kernel_config is not None:
+            from ..plugins.model_plugins.kernels.interface import KernelPlugin
+
+            model = KernelPlugin(self.args.kernel_config.name)(
+                model, include_kernels=self.args.kernel_config.get("include_kernels")
+            )
+
+        return model
+
+
+if __name__ == "__main__":
+    """
+    python -m llamafactory.v1.core.model_engine --model llamafactory/tiny-random-qwen2.5
+    """
+    from ..config.arg_parser import get_args
+
+    model_args, *_ = get_args()
+    model_engine = ModelEngine(model_args=model_args)
+    print(model_engine.processor)
+    print(model_engine.model_config)
+    print(model_engine.model)
diff --git a/LlamaFactory/src/llamafactory/v1/core/utils/__init__.py b/LlamaFactory/src/llamafactory/v1/core/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/core/utils/batching.py b/LlamaFactory/src/llamafactory/v1/core/utils/batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..e87a959747b99af14c9c3b9e419b974f869356c4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/core/utils/batching.py
@@ -0,0 +1,244 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Batching utils supports stateful dataloader.
+
+1. Init stateful dataloader (tokenize)
+2. Add to buffer
+3. Yield batch indexes (micro batch * grad acc)
+    a) non pack + non dynamic
+    b) non pack + dynamic
+    c) pack + non dynamic
+    d) pack + dynamic
+"""
+
+from collections.abc import Iterator
+from typing import Any
+
+from torch.utils.data import default_collate
+from torchdata.stateful_dataloader import StatefulDataLoader
+from torchdata.stateful_dataloader.sampler import StatefulDistributedSampler
+
+from ...accelerator.interface import Dim, DistributedInterface
+from ...config import BatchingStrategy
+from ...utils import logging
+from ...utils.helper import pad_and_truncate
+from ...utils.objects import StatefulBuffer
+from ...utils.types import BatchInfo, BatchInput, ModelInput, TorchDataset
+from .rendering import Renderer
+
+
+logger = logging.get_logger(__name__)
+
+
+def default_collate_fn(buffer: StatefulBuffer, batch_info: BatchInfo) -> list[BatchInput] | None:
+    micro_batch_size = batch_info["micro_batch_size"]
+    num_micro_batch = batch_info["num_micro_batch"]
+    cutoff_len = batch_info["cutoff_len"]
+    batch_size = micro_batch_size * num_micro_batch
+    if len(buffer) < batch_size:
+        return None
+
+    samples = buffer.get(batch_size)
+    batch = []
+    for i in range(num_micro_batch):
+        micro_batch = samples[i * micro_batch_size : (i + 1) * micro_batch_size]
+        batch.append(default_collate(pad_and_truncate(micro_batch, cutoff_len)))
+
+    return batch
+
+
+class BatchGenerator(Iterator):
+    def __init__(
+        self,
+        dataset: TorchDataset,
+        renderer: Renderer,
+        micro_batch_size: int = 1,
+        global_batch_size: int | None = None,
+        cutoff_len: int = 2048,
+        batching_workers: int = 0,
+        batching_strategy: BatchingStrategy = BatchingStrategy.NORMAL,
+        pin_memory: bool = True,
+        drop_last: bool = True,
+    ) -> None:
+        self.dataset = dataset
+        self.renderer = renderer
+
+        self.micro_batch_size = micro_batch_size
+        self.global_batch_size = global_batch_size
+        self.cutoff_len = cutoff_len
+        self.batching_workers = batching_workers
+        self.batching_strategy = batching_strategy
+        self.pin_memory = pin_memory
+        self.drop_last = drop_last
+        # TODO: support length and infinity
+        dp_size = DistributedInterface().get_world_size(Dim.DP)
+
+        if self.global_batch_size is None:
+            self.global_batch_size = dp_size * micro_batch_size
+            self.num_micro_batch = 1
+        elif self.global_batch_size % (dp_size * micro_batch_size) == 0:
+            self.num_micro_batch = global_batch_size // dp_size // micro_batch_size
+        else:
+            raise ValueError(
+                "Global batch size must be divisible by DP size and micro batch size. "
+                f"Got {global_batch_size} % ({dp_size} * {micro_batch_size}) != 0."
+            )
+
+        if not self.drop_last:
+            raise ValueError("Drop last must be True.")
+
+        self._init_data_provider()
+
+        self._is_resuming: bool = False
+        self._data_iter = iter(self._data_provider)
+        self._buffer = StatefulBuffer()
+
+        self._batch_info: BatchInfo = {
+            "micro_batch_size": self.micro_batch_size,
+            "num_micro_batch": self.num_micro_batch,
+            "cutoff_len": self.cutoff_len,
+            "data_iter": self._data_iter,
+        }
+
+        logger.info_rank0(
+            f"Init unified data loader with global batch size {self.global_batch_size}, "
+            f"micro batch size {self.micro_batch_size}, "
+            f"num micro batch {self.num_micro_batch}, "
+            f"cutoff len {self.cutoff_len}, "
+            f"batching workers {self.batching_workers}, "
+            f"batching strategy {self.batching_strategy}."
+        )
+
+    def _init_data_provider(self) -> None:
+        if len(self.dataset) != -1:
+            sampler = StatefulDistributedSampler(
+                self.dataset,
+                num_replicas=DistributedInterface().get_world_size(Dim.DP),
+                rank=DistributedInterface().get_rank(Dim.DP),
+                shuffle=True,
+                seed=0,
+                drop_last=self.drop_last,
+            )
+        else:
+            raise NotImplementedError("Iterable dataset is not supported yet.")
+
+        self._data_provider = StatefulDataLoader(
+            self.dataset,
+            batch_size=self.micro_batch_size * self.num_micro_batch,
+            sampler=sampler,
+            num_workers=self.batching_workers,
+            collate_fn=self.renderer.process_samples,
+            pin_memory=self.pin_memory,
+            pin_memory_device=DistributedInterface().current_device.type,
+            drop_last=self.drop_last,
+        )
+        if self.batching_strategy == BatchingStrategy.NORMAL:
+            self._length = len(self._data_provider)
+        else:
+            from ...plugins.trainer_plugins.batching import BatchingPlugin
+
+            self._length = BatchingPlugin(self.batching_strategy).compute_length(self._data_provider)
+            raise NotImplementedError("Batching strategy other than NORMAL is not supported yet.")
+
+    def __len__(self) -> int:
+        return self._length
+
+    def __iter__(self):
+        if not self._is_resuming:
+            self._buffer.clear()
+            self._buffer_tokens = 0
+
+        self._data_iter = iter(self._data_provider)
+        self._is_resuming = False
+        return self
+
+    def __next__(self):
+        self._fill_buffer()
+        batch = self._generate_batch()
+        if batch is None:
+            raise StopIteration
+
+        return batch
+
+    def _fill_buffer(self) -> None:
+        if self.batching_strategy == BatchingStrategy.NORMAL:
+            while len(self._buffer) < self.micro_batch_size * self.num_micro_batch:
+                try:
+                    samples: list[ModelInput] = next(self._data_iter)
+                except StopIteration:
+                    break
+
+                self._buffer.put(samples)
+        else:
+            from ...plugins.trainer_plugins.batching import BatchingPlugin
+
+            BatchingPlugin(self.batching_strategy).fill_buffer(self._buffer, self._batch_info)
+
+    def _generate_batch(self) -> list[BatchInput] | None:
+        if self.batching_strategy == BatchingStrategy.NORMAL:
+            return default_collate_fn(self._buffer, self._batch_info)
+        else:
+            from ...plugins.trainer_plugins.batching import BatchingPlugin
+
+            return BatchingPlugin(self.batching_strategy).generate_batch(self._buffer, self._batch_info)
+
+    def state_dict(self) -> dict[str, Any]:
+        return {
+            "buffer": self._buffer,
+            "buffer_tokens": self._buffer_tokens,
+            "data_provider": self._data_provider.state_dict(),
+        }
+
+    def load_state_dict(self, state: dict[str, Any]) -> None:
+        self._buffer = state["buffer"]
+        self._buffer_tokens = state["buffer_tokens"]
+        self._data_provider.load_state_dict(state["data_provider"])
+        self._is_resuming = True
+
+    def set_epoch(self, epoch: int) -> None:
+        if hasattr(self._data_provider.sampler, "set_epoch"):
+            self._data_provider.sampler.set_epoch(epoch)
+
+
+if __name__ == "__main__":
+    """
+    python -m llamafactory.v1.core.utils.batching \
+        --model llamafactory/tiny-random-qwen2.5 \
+        --train_dataset data/v1_sft_demo.yaml \
+        --micro_batch_size 2 \
+        --global_batch_size 4 \
+        --batching_workers 0
+    """
+    from ...config.arg_parser import get_args
+    from ..data_engine import DataEngine
+    from ..model_engine import ModelEngine
+
+    model_args, data_args, training_args, _ = get_args()
+    data_engine = DataEngine(data_args.train_dataset)
+    model_engine = ModelEngine(model_args=model_args)
+    batch_generator = BatchGenerator(
+        data_engine,
+        model_engine.renderer,
+        micro_batch_size=training_args.micro_batch_size,
+        global_batch_size=training_args.global_batch_size,
+        cutoff_len=training_args.cutoff_len,
+        batching_workers=training_args.batching_workers,
+        batching_strategy=training_args.batching_strategy,
+    )
+    for batch in batch_generator:
+        print(batch)
+        print(len(batch))
+        print(batch[0]["input_ids"].shape)
+        break
diff --git a/LlamaFactory/src/llamafactory/v1/core/utils/callback.py b/LlamaFactory/src/llamafactory/v1/core/utils/callback.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/core/utils/inference_engine.py b/LlamaFactory/src/llamafactory/v1/core/utils/inference_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b090dae814a4271636b5105bc016454b54421735
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/core/utils/inference_engine.py
@@ -0,0 +1,121 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator
+from threading import Thread
+
+import torch
+from transformers import AsyncTextIteratorStreamer
+
+from ...accelerator.interface import DistributedInterface
+from ...config import ModelArguments, SampleArguments
+from ...utils.helper import get_tokenizer
+from ...utils.types import HFModel, Message, Sample, TorchDataset
+from .rendering import Renderer
+
+
+class BaseEngine(ABC):
+    """Abstract interface for inference engines.
+
+    Subclasses must implement construction, streaming generation for a single
+    conversation, and batch inference over a dataset.
+    """
+
+    @abstractmethod
+    def __init__(
+        self,
+        args: SampleArguments,
+        model_args: ModelArguments,
+        model: HFModel,
+        renderer: Renderer,
+    ) -> None:
+        """Initialize the engine.
+
+        Args:
+            args: Sample arguments.
+            model_args: Model arguments.
+            model: Model.
+            renderer: Renderer.
+        """
+        ...
+
+    @abstractmethod
+    async def generate(self, messages: list[Message], tools: str | None = None) -> AsyncGenerator[str, None]:
+        """Generate tokens asynchronously.
+
+        Args:
+            messages: List of messages.
+            tools: Tools string.
+
+        Yields:
+            Generated tokens.
+        """
+        ...
+
+    @abstractmethod
+    async def batch_infer(self, dataset: TorchDataset) -> list[Sample]:
+        """Batch infer samples.
+
+        Args:
+            dataset: Torch dataset.
+
+        Returns:
+            List of samples.
+        """
+        ...
+
+
+class HuggingFaceEngine(BaseEngine):
+    def __init__(
+        self,
+        args: SampleArguments,
+        model_args: ModelArguments,
+        model: HFModel,
+        renderer: Renderer,
+    ) -> None:
+        self.args = args
+        self.model_args = model_args
+        self.model = model
+        self.renderer = renderer
+        self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1")))
+
+    @torch.inference_mode()
+    async def generate(self, messages: list[Message], tools: str | None = None) -> AsyncGenerator[str, None]:
+        async with self.semaphore:
+            model_inputs = self.renderer.render_messages(messages, tools, is_generate=True)
+            streamer = AsyncTextIteratorStreamer(
+                tokenizer=get_tokenizer(self.renderer.processor),
+                skip_prompt=True,
+                skip_special_tokens=True,  # TODO: configurable
+            )
+            device = DistributedInterface().current_device
+            kwargs = {
+                "input_ids": torch.tensor([model_inputs["input_ids"]]).to(device),
+                "attention_mask": torch.tensor([model_inputs["attention_mask"]]).to(device),
+                "max_new_tokens": self.args.max_new_tokens,
+                "streamer": streamer,
+            }
+            thread = Thread(target=self.model.generate, kwargs=kwargs, daemon=True)
+            thread.start()
+
+            async for token in streamer:
+                yield token
+
+    async def batch_infer(self, dataset: TorchDataset) -> list[Sample]:
+        """Batch infer samples.
+
+        Args:
+            dataset: Torch dataset.
+
+        Returns:
+            List of samples.
+        """
+        raise NotImplementedError("Batch infer is not implemented.")
diff --git a/LlamaFactory/src/llamafactory/v1/core/utils/rendering.py b/LlamaFactory/src/llamafactory/v1/core/utils/rendering.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbe8383c4cc0b94213ca7cc7af1bfc0d3f67dc46
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/core/utils/rendering.py
@@ -0,0 +1,169 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Rendering utils.
+
+How to use:
+renderer = Renderer(template, processor)
+renderer.render_messages(messages: list[Message], tools: str | None) -> ModelInput
+renderer.parse_message(text: str) -> Message
+renderer.process_samples(samples: list[Sample]) -> list[ModelInput]
+"""
+
+import numpy as np
+
+from ...utils.constants import IGNORE_INDEX
+from ...utils.helper import get_tokenizer
+from ...utils.types import Message, ModelInput, Processor, Sample
+
+
+def render_chatml_messages(
+    processor: Processor,
+    messages: list[Message],
+    tools: str | None = None,
+    is_generate: bool = False,
+) -> ModelInput:
+    """Apply chatml template to messages and convert them to model input.
+
+    See https://huggingface.co/spaces/huggingfacejs/chat-template-playground?modelId=Qwen/Qwen2-7B-Instruct
+    """
+    tokenizer = get_tokenizer(processor)
+    input_ids, labels, loss_weights = [], [], []
+
+    for message in messages:
+        temp_str = "<|im_start|>" + message["role"] + "\n"
+        for content in message["content"]:
+            if content["type"] == "text":
+                temp_str += content["value"]
+            else:
+                raise ValueError(f"Unsupported content type: {content['type']}")
+
+        temp_str += "<|im_end|>\n"
+        temp_weight = message.get("loss_weight", 1.0 if message["role"] == "assistant" else 0.0)
+        temp_ids = tokenizer.encode(temp_str, add_special_tokens=False)
+        input_ids.extend(temp_ids)
+        loss_weights.extend([temp_weight] * len(temp_ids))
+        if temp_weight > 1e-6:
+            labels.extend(temp_ids)
+        else:
+            labels.extend([IGNORE_INDEX] * len(temp_ids))
+
+    if is_generate:
+        temp_ids = tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False)
+        input_ids.extend(temp_ids)
+        loss_weights.extend([0.0] * len(temp_ids))
+        labels.extend([IGNORE_INDEX] * len(temp_ids))
+
+    return ModelInput(
+        input_ids=input_ids,
+        attention_mask=[1] * len(input_ids),
+        labels=labels,
+        loss_weights=loss_weights,
+    )
+
+
+def parse_chatml_message(generated_text: str) -> Message:
+    """Parse a message in ChatML format.
+
+    Args:
+        generated_text (str): The generated text in ChatML format.
+
+    Returns:
+        Message: The parsed message.
+    """
+    return Message(role="assistant", content=[{"type": "text", "value": generated_text}])
+
+
+class Renderer:
+    def __init__(self, template: str, processor: Processor):
+        self.template = template
+        self.processor = processor
+
+    def render_messages(
+        self, messages: list[Message], tools: str | None = None, is_generate: bool = False
+    ) -> ModelInput:
+        """Apply template to messages and convert them to model input.
+
+        Args:
+            messages (list[Message]): The messages to render.
+            tools (str | None, optional): The tools to use. Defaults to None.
+            is_generate (bool, optional): Whether to render for generation. Defaults to False.
+
+        Returns:
+            ModelInput: The rendered model input.
+        """
+        if self.template == "chatml":
+            return render_chatml_messages(self.processor, messages, tools, is_generate)
+        else:
+            from ...plugins.model_plugins.rendering import RenderingPlugin
+
+            return RenderingPlugin(self.template).render_messages(self.processor, messages, tools, is_generate)
+
+    def parse_message(self, generated_text: str) -> Message:
+        """Parse a message in the template format.
+
+        Args:
+            generated_text (str): The generated text in the template format.
+
+        Returns:
+            Message: The parsed message.
+        """
+        if self.template == "chatml":
+            return parse_chatml_message(generated_text)
+        else:
+            from ...plugins.model_plugins.rendering import RenderingPlugin
+
+            return RenderingPlugin(self.template).parse_message(generated_text)
+
+    def process_samples(self, samples: list[Sample]) -> list[ModelInput]:
+        """Process samples to model input.
+
+        Args:
+            samples (list[Sample]): The samples to process.
+
+        Returns:
+            list[ModelInput]: The processed model inputs.
+        """
+        model_inputs = []
+        for sample in samples:
+            if "messages" in sample:
+                model_input = self.render_messages(sample["messages"], sample.get("tools"))
+            elif "chosen_messages" in sample and "rejected_messages" in sample:
+                chosen_input = self.render_messages(sample["chosen_messages"], sample.get("tools"))
+                rejected_input = self.render_messages(sample["rejected_messages"], sample.get("tools"))
+                chosen_input["token_type_ids"] = [1] * len(chosen_input["input_ids"])
+                rejected_input["token_type_ids"] = [2] * len(rejected_input["input_ids"])
+                model_input = ModelInput(
+                    input_ids=chosen_input["input_ids"] + rejected_input["input_ids"],
+                    attention_mask=chosen_input["attention_mask"] + rejected_input["attention_mask"],
+                    labels=chosen_input["labels"] + rejected_input["labels"],
+                    loss_weights=chosen_input["loss_weights"] + rejected_input["loss_weights"],
+                    token_type_ids=chosen_input["token_type_ids"] + rejected_input["token_type_ids"],
+                )
+                if "position_ids" in chosen_input:
+                    model_input["position_ids"] = np.concatenate(
+                        [chosen_input["position_ids"], rejected_input["position_ids"]], axis=-1
+                    )
+            else:
+                raise ValueError("No valid messages or chosen_messages/rejected_messages found in sample.")
+
+            if "extra_info" in sample:
+                model_input["extra_info"] = sample["extra_info"]
+
+            if "_dataset_name" in sample:
+                model_input["_dataset_name"] = sample["_dataset_name"]
+
+            model_inputs.append(model_input)
+
+        return model_inputs
diff --git a/LlamaFactory/src/llamafactory/v1/launcher.py b/LlamaFactory/src/llamafactory/v1/launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9b077b1c3d54c9d91204910134afaa519c54b16
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/launcher.py
@@ -0,0 +1,179 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import subprocess
+import sys
+from copy import deepcopy
+
+
+# Help banner printed for the `help` command and for unrecognized commands.
+USAGE = "\n".join(
+    [
+        "-" * 70,
+        "| Usage:                                                             |",
+        "|   llamafactory-cli sft -h: train models                            |",
+        "|   llamafactory-cli version: show version info                      |",
+        "| Hint: You can use `lmf` as a shortcut for `llamafactory-cli`.      |",
+        "-" * 70,
+    ]
+)
+
+# Sub-commands that start a training job; `launch` may re-execute them through torchrun.
+_DIST_TRAIN_COMMANDS = ("train", "sft", "dpo", "rm")
+
+
+def launch():
+    """Dispatch the ``llamafactory-cli`` sub-command found in ``sys.argv[1]``.
+
+    The sub-command is popped from ``sys.argv`` (``help`` is assumed when it is missing).
+    Training commands are re-executed through ``torchrun`` when ``FORCE_TORCHRUN`` is enabled,
+    or when more than one device is reported and both ``use_ray()`` and ``use_kt()`` are false;
+    otherwise they run inside the current process.
+
+    Environment variables read for the torchrun launch: ``NNODES``, ``NODE_RANK``,
+    ``NPROC_PER_NODE``, ``MASTER_ADDR``, ``MASTER_PORT``, ``OPTIM_TORCH`` and, for elastic
+    jobs, ``RDZV_ID``, ``MAX_RESTARTS``, ``MIN_NNODES``, ``MAX_NNODES``.
+
+    Raises:
+        NotImplementedError: For ``env`` and ``version``, and for ``dpo`` / ``rm`` when run in-process.
+        SystemExit: After a torchrun launch, carrying torchrun's own exit code.
+    """
+    from .accelerator.helper import get_device_count
+    from .utils.env import find_available_port, is_env_enabled, use_kt, use_ray
+    from .utils.logging import get_logger
+
+    logger = get_logger(__name__)
+
+    # NOTE:
+    # `llamafactory-cli <command> ...` enters here first.
+    # We may re-launch via `torchrun` for distributed training. In that case we must
+    # forward `<command>` as argv[1] to the re-executed script, otherwise the script
+    # will misinterpret the first user argument (e.g. yaml config) as the command.
+    command = sys.argv.pop(1) if len(sys.argv) > 1 else "help"
+
+    if command in _DIST_TRAIN_COMMANDS and (
+        is_env_enabled("FORCE_TORCHRUN") or (get_device_count() > 1 and not use_ray() and not use_kt())
+    ):
+        nnodes = os.getenv("NNODES", "1")
+        node_rank = os.getenv("NODE_RANK", "0")
+        nproc_per_node = os.getenv("NPROC_PER_NODE", str(get_device_count()))
+        master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
+        master_port = os.getenv("MASTER_PORT", str(find_available_port()))
+        logger.info_rank0(f"Initializing {nproc_per_node} distributed tasks at: {master_addr}:{master_port}")
+        if int(nnodes) > 1:
+            logger.info_rank0(f"Multi-node training enabled: num nodes: {nnodes}, node rank: {node_rank}")
+
+        # elastic launch support
+        max_restarts = os.getenv("MAX_RESTARTS", "0")
+        rdzv_id = os.getenv("RDZV_ID")
+        min_nnodes = os.getenv("MIN_NNODES")
+        max_nnodes = os.getenv("MAX_NNODES")
+
+        env = deepcopy(os.environ)
+        if is_env_enabled("OPTIM_TORCH", "1"):
+            # optimize DDP, see https://zhuanlan.zhihu.com/p/671834539
+            env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+            env["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+        torchrun_args = [
+            "torchrun",
+            "--nproc-per-node",
+            nproc_per_node,
+        ]
+        if rdzv_id is not None:
+            # launch elastic job with fault tolerant support when possible
+            # see also https://docs.pytorch.org/docs/stable/elastic/train_script.html
+            rdzv_nnodes = nnodes
+            # elastic number of nodes if MIN_NNODES and MAX_NNODES are set
+            if min_nnodes is not None and max_nnodes is not None:
+                rdzv_nnodes = f"{min_nnodes}:{max_nnodes}"
+
+            torchrun_args.extend(
+                [
+                    "--nnodes",
+                    rdzv_nnodes,
+                    "--rdzv-id",
+                    rdzv_id,
+                    "--rdzv-backend",
+                    "c10d",
+                    "--rdzv-endpoint",
+                    f"{master_addr}:{master_port}",
+                    "--max-restarts",
+                    max_restarts,
+                ]
+            )
+        else:
+            torchrun_args.extend(
+                [
+                    "--nnodes",
+                    nnodes,
+                    "--node_rank",
+                    node_rank,
+                    "--master_addr",
+                    master_addr,
+                    "--master_port",
+                    master_port,
+                ]
+            )
+
+        # NOTE: DO NOT USE shell=True to avoid security risk; the argument list is passed as-is.
+        # `check=True` is deliberately not used: it would raise CalledProcessError on failure and
+        # the exit code of torchrun would never reach `sys.exit` below.
+        script_args = [__file__, command] + sys.argv[1:]
+        process = subprocess.run(torchrun_args + script_args, env=env)
+
+        sys.exit(process.returncode)
+
+    elif command == "chat":
+        from .samplers.cli_sampler import run_chat
+
+        run_chat()
+
+    elif command == "env":
+        raise NotImplementedError("Environment information is not implemented yet.")
+
+    elif command == "version":
+        raise NotImplementedError("Version information is not implemented yet.")
+
+    elif command == "help":
+        print(USAGE)
+
+    elif command in _DIST_TRAIN_COMMANDS:
+        # Single GPU training without torchrun
+        if command in ("train", "sft"):
+            from llamafactory.v1.trainers.sft_trainer import run_sft
+
+            run_sft()
+        elif command == "dpo":
+            raise NotImplementedError("DPO trainer is not implemented yet.")
+        elif command == "rm":
+            raise NotImplementedError("RM trainer is not implemented yet.")
+
+    else:
+        print(f"Unknown command: {command}.\n{USAGE}")
+
+
+def main():
+    """Run the trainer selected by ``sys.argv[1]``; used when this file is executed as a script.
+
+    ``sys.argv[1]`` is expected to hold the sub-command (sft/dpo/rm/train) and ``sys.argv[2:]``
+    the remaining arguments. A recognized sub-command is removed from ``sys.argv`` before the
+    trainer starts. When there is no argument at all, or the first argument is a yaml config
+    path, the command defaults to ``sft`` and ``sys.argv`` is left untouched.
+
+    Raises:
+        ValueError: If the first argument is neither a known sub-command nor a yaml path.
+        NotImplementedError: For the ``dpo`` and ``rm`` commands.
+    """
+    command = "sft"
+    if len(sys.argv) > 1:
+        first_arg = sys.argv[1]
+        if first_arg in _DIST_TRAIN_COMMANDS:
+            # Routing needs the sub-command, but downstream trainers usually expect argv without it.
+            # Only pop when the command really is in argv: popping for the implicit default
+            # would raise IndexError on an argument-less invocation.
+            command = sys.argv.pop(1)
+        elif not first_arg.endswith((".yaml", ".yml")):
+            # Not a config path either; remember it so the dispatch below can reject it.
+            command = first_arg
+        # Otherwise: backward-compat, `torchrun launcher.py config.yaml` is treated as sft.
+
+    if command in ("train", "sft"):
+        from llamafactory.v1.trainers.sft_trainer import run_sft
+
+        run_sft()
+    elif command == "dpo":
+        # from llamafactory.v1.trainers.dpo_trainer import run_dpo
+        # run_dpo()
+        raise NotImplementedError("DPO trainer is not implemented yet.")
+    elif command == "rm":
+        # from llamafactory.v1.trainers.rm_trainer import run_rm
+        # run_rm()
+        raise NotImplementedError("RM trainer is not implemented yet.")
+    else:
+        # Fail loudly instead of exiting successfully without having trained anything.
+        raise ValueError(f"Unknown command: {command}.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/converter.py b/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7075fe5dcb46336613121b98d06599b15cd69a19
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/converter.py
@@ -0,0 +1,223 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+from typing import Any, Literal, NotRequired, TypedDict
+
+from ...utils import logging
+from ...utils.plugin import BasePlugin
+from ...utils.types import DPOSample, Sample, SFTSample, ToolCall
+
+
+logger = logging.get_logger(__name__)
+
+
+class AlpacaSample(TypedDict, total=False):
+    """Raw Alpaca-style record consumed by ``alpaca_converter``."""
+
+    # ``total=False`` already makes every key optional, so the ``NotRequired`` markers
+    # below do not change which keys may be omitted.
+    system: NotRequired[str]
+    instruction: str
+    input: NotRequired[str]
+    output: str
+
+
+# One ShareGPT conversation turn. The functional TypedDict syntax is used because the key
+# ``from`` is a Python keyword and cannot be written as a class attribute.
+SharegptMessage = TypedDict(
+    "SharegptMessage",
+    {"from": Literal["human", "gpt", "system", "function_call", "observation"], "value": str},
+)
+
+
+class SharegptSample(TypedDict, total=False):
+    """Raw ShareGPT-style record consumed by ``sharegpt_converter``."""
+
+    # Ordered conversation turns.
+    conversations: list[SharegptMessage]
+    # Tool definitions as a JSON string (``sharegpt_converter`` parses it with ``json.loads``).
+    tools: NotRequired[str]
+
+
+class OpenaiMessage(TypedDict, total=False):
+    """One message in OpenAI chat format: a role plus its text content."""
+
+    role: Literal["user", "assistant", "tool"]
+    content: str
+
+
+class OpenaiSample(TypedDict, total=False):
+    """Raw record holding a list of OpenAI-format messages."""
+
+    messages: list[OpenaiMessage]
+
+
+class PairSample(TypedDict, total=False):
+    """Raw preference-pair record consumed by ``pair_converter``."""
+
+    # Preferred conversation.
+    chosen: list[OpenaiMessage]
+    # Dispreferred conversation.
+    rejected: list[OpenaiMessage]
+    # Tool definitions as a JSON string; ``pair_converter`` reads this key, so the schema
+    # declares it (same shape as ``SharegptSample.tools``).
+    tools: NotRequired[str]
+
+
+class DataConverterPlugin(BasePlugin):
+    """Plugin for data converters.
+
+    Converters in this module are attached with ``@DataConverterPlugin("<name>").register()``.
+    """
+
+    def __call__(self, raw_sample: dict[str, Any]) -> Sample:
+        # Delegates to ``BasePlugin.__call__``; presumably that invokes the converter
+        # registered under this plugin's name (confirm against ``utils.plugin``).
+        return super().__call__(raw_sample)
+
+
+@DataConverterPlugin("alpaca").register()
+def alpaca_converter(raw_sample: AlpacaSample) -> SFTSample:
+    """Convert Alpaca sample to SFT sample.
+
+    See raw example at: https://huggingface.co/datasets/llamafactory/alpaca_gpt4_en
+
+    A field whose value is ``None`` is treated exactly like a missing field. Tabular loaders
+    commonly fill absent columns with ``None``, which would otherwise crash the string
+    concatenation below or produce a message whose text is ``None``.
+
+    Args:
+        raw_sample (AlpacaSample): Alpaca sample.
+
+    Returns:
+        SFTSample: SFT sample with up to three messages (system, user, assistant). Only the
+            assistant message carries a non-zero loss weight.
+    """
+    system = raw_sample.get("system")
+    instruction = raw_sample.get("instruction")
+    extra_input = raw_sample.get("input")
+    output = raw_sample.get("output")
+
+    messages = []
+    if system is not None:
+        messages.append({"role": "system", "content": [{"type": "text", "value": system}], "loss_weight": 0.0})
+
+    if instruction is not None or extra_input is not None:
+        # Instruction and input are concatenated directly, without a separator.
+        prompt = (instruction or "") + (extra_input or "")
+        messages.append({"role": "user", "content": [{"type": "text", "value": prompt}], "loss_weight": 0.0})
+
+    if output is not None:
+        messages.append({"role": "assistant", "content": [{"type": "text", "value": output}], "loss_weight": 1.0})
+
+    return {"messages": messages}
+
+
+@DataConverterPlugin("sharegpt").register()
+def sharegpt_converter(raw_sample: SharegptSample) -> SFTSample:
+    """Turn a ShareGPT record into an SFT sample.
+
+    See raw example at: https://huggingface.co/datasets/llamafactory/glaive_toolcall_en
+
+    Turns with an unknown ``from`` tag, and ``function_call`` turns whose value is not valid
+    JSON, are skipped with a warning. Only ``gpt`` and ``function_call`` turns get a loss
+    weight of 1.0.
+
+    Args:
+        raw_sample (SharegptSample): ShareGPT sample.
+
+    Returns:
+        SFTSample: SFT sample with ``messages`` and, when the raw ``tools`` field holds valid
+            JSON, a re-serialized ``tools`` string.
+    """
+    role_of = {
+        "system": "system",
+        "human": "user",
+        "gpt": "assistant",
+        "observation": "tool",
+        "function_call": "assistant",
+    }
+    converted = []
+    for turn in raw_sample.get("conversations", []):
+        source = turn["from"]
+        if source not in role_of:
+            logger.warning_rank0(f"Unsupported role tag {source} in message: {turn}")
+            continue
+
+        if source != "function_call":
+            # Plain text turn.
+            converted.append(
+                {
+                    "role": role_of[source],
+                    "content": [{"type": "text", "value": turn["value"]}],
+                    "loss_weight": 1.0 if source == "gpt" else 0.0,
+                }
+            )
+            continue
+
+        # Tool-call turn: the value is JSON holding one call or a list of calls.
+        try:
+            parsed: ToolCall | list[ToolCall] = json.loads(turn["value"])
+        except json.JSONDecodeError:
+            logger.warning_rank0(f"Invalid tool call format: {str(turn['value'])}")
+            continue
+
+        calls = parsed if isinstance(parsed, list) else [parsed]
+        converted.append(
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "value": json.dumps(call)} for call in calls],
+                "loss_weight": 1.0,
+            }
+        )
+
+    sample = {"messages": converted}
+
+    raw_tools = raw_sample.get("tools")
+    if raw_tools:
+        try:
+            # Round-trip through JSON so only well-formed tool definitions are kept.
+            sample["tools"] = json.dumps(json.loads(raw_tools))
+        except json.JSONDecodeError:
+            logger.warning_rank0(f"Invalid tools format: {str(raw_tools)}")
+
+    return sample
+
+
+@DataConverterPlugin("pair").register()
+def pair_converter(raw_sample: PairSample) -> DPOSample:
+    """Turn a preference-pair record into a DPO sample.
+
+    See raw example at: https://huggingface.co/datasets/HuggingFaceH4/orca_dpo_pairs
+
+    Args:
+        raw_sample (PairSample): pair sample with chosen, rejected fields.
+
+    Returns:
+        DPOSample: DPO sample with chosen_messages and rejected_messages, plus a re-serialized
+            ``tools`` string when the raw ``tools`` field holds valid JSON.
+    """
+
+    def process_message(raw_messages: list[OpenaiMessage]):
+        """Convert one conversation; messages with unparsable tool JSON are dropped with a warning."""
+        converted = []
+        for entry in raw_messages:
+            role = entry["role"]
+            # Only assistant messages contribute to the loss.
+            weight = 1.0 if role == "assistant" else 0.0
+
+            if role != "tool":
+                converted.append(
+                    {"role": role, "content": [{"type": "text", "value": entry["content"]}], "loss_weight": weight}
+                )
+                continue
+
+            # NOTE(review): content of role "tool" is parsed as tool-call JSON here, whereas
+            # sharegpt_converter emits tool calls under the assistant role; confirm intent.
+            try:
+                parsed: ToolCall | list[ToolCall] = json.loads(entry["content"])
+            except json.JSONDecodeError:
+                logger.warning_rank0(f"Invalid tool call format: {str(entry['content'])}")
+                continue
+
+            calls = parsed if isinstance(parsed, list) else [parsed]
+            converted.append(
+                {
+                    "role": role,
+                    "content": [{"type": "tool_call", "value": json.dumps(call)} for call in calls],
+                    "loss_weight": weight,
+                }
+            )
+
+        return converted
+
+    sample = {
+        "chosen_messages": process_message(raw_sample.get("chosen", [])),
+        "rejected_messages": process_message(raw_sample.get("rejected", [])),
+    }
+
+    raw_tools = raw_sample.get("tools")
+    if raw_tools:
+        try:
+            # Round-trip through JSON so only well-formed tool definitions are kept.
+            sample["tools"] = json.dumps(json.loads(raw_tools))
+        except json.JSONDecodeError:
+            logger.warning_rank0(f"Invalid tools format: {str(raw_tools)}")
+
+    return sample
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/loader.py b/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb24ca614ae749bd497c6c4fde935885af44a831
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/data_plugins/loader.py
@@ -0,0 +1,108 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import random
+from typing import Any, Literal
+
+from datasets import load_dataset
+
+from ...utils.plugin import BasePlugin
+from ...utils.types import DatasetInfo, HFDataset
+
+
+class DataLoaderPlugin(BasePlugin):
+    """Plugin that turns a dataset description into a loaded dataset."""
+
+    def load(self, dataset_info: DatasetInfo) -> HFDataset:
+        """Load the dataset described by ``dataset_info``.
+
+        ``path`` is mandatory; ``split`` falls back to ``"train"`` and ``streaming`` to
+        ``False``. The three values are forwarded positionally to the parent ``__call__``.
+        """
+        return super().__call__(
+            dataset_info["path"],
+            dataset_info.get("split", "train"),
+            dataset_info.get("streaming", False),
+        )
+
+
+def _get_builder_name(path: str) -> Literal["arrow", "csv", "json", "parquet", "text"]:
+    """Map a file path to the name of the dataset builder that reads it.
+
+    The decision is based on the (case-sensitive) file extension only.
+
+    Args:
+        path (str): Dataset path.
+
+    Returns:
+        Literal["arrow", "csv", "json", "parquet", "text"]: Dataset builder name.
+
+    Raises:
+        ValueError: If the extension is missing or not one of the supported ones.
+    """
+    builder_by_extension = {
+        "arrow": "arrow",
+        "csv": "csv",
+        "json": "json",
+        "jsonl": "json",
+        "parquet": "parquet",
+        "txt": "text",
+    }
+    # splitext keeps the leading dot, hence the [1:].
+    filetype = os.path.splitext(path)[-1][1:]
+    if filetype not in builder_by_extension:
+        raise ValueError(f"Unknown dataset filetype: {filetype}.")
+
+    return builder_by_extension[filetype]
+
+
+@DataLoaderPlugin("local").register()
+def load_data_from_file(filepath: str, split: str, streaming: bool) -> HFDataset:
+    """Load a dataset from a local file or directory.
+
+    For a directory, the builder is inferred from the first entry (in sorted order) that has
+    a supported extension, so stray files such as a README or sub-directories are ignored.
+
+    Args:
+        filepath (str): Path to a data file or to a directory of data files.
+        split (str): Split name passed to ``load_dataset``.
+        streaming (bool): If true, the loaded dataset is converted to an iterable dataset.
+
+    Returns:
+        HFDataset: The loaded dataset.
+
+    Raises:
+        ValueError: If ``filepath`` is neither a file nor a directory, if the file type is
+            unsupported, or if the directory contains no supported data file.
+    """
+    if os.path.isdir(filepath):
+        filetype = None
+        # os.listdir order is arbitrary; sorting makes the chosen builder deterministic.
+        for filename in sorted(os.listdir(filepath)):
+            try:
+                filetype = _get_builder_name(filename)
+            except ValueError:
+                continue  # not a supported data file, keep looking
+            break
+
+        if filetype is None:
+            raise ValueError(f"Can not load dataset from {filepath}: no supported data file found.")
+
+        dataset = load_dataset(filetype, data_dir=filepath, split=split)
+    elif os.path.isfile(filepath):
+        filetype = _get_builder_name(filepath)
+        dataset = load_dataset(filetype, data_files=filepath, split=split)
+    else:
+        raise ValueError(f"Can not load dataset from {filepath}.")
+
+    if streaming:  # faster when data is streamed from local files
+        dataset = dataset.to_iterable_dataset()
+
+    return dataset
+
+
+def adjust_data_index(
+    data_index: list[tuple[str, int]], size: int | None, weight: float | None
+) -> list[tuple[str, int]]:
+    """Resample a dataset index to a requested size and/or weight.
+
+    Both steps draw with replacement via ``random.choices``. ``size`` is applied first, then
+    ``weight`` scales the length of whatever the first step produced. With both set to
+    ``None`` the input list is returned as is.
+
+    Args:
+        data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
+        size (Optional[int]): Desired dataset size.
+        weight (Optional[float]): Desired dataset weight.
+
+    Returns:
+        list[tuple[str, int]]: Adjusted dataset index.
+    """
+    adjusted = data_index
+    if size is not None:
+        adjusted = random.choices(adjusted, k=size)
+
+    if weight is not None:
+        # int() truncates towards zero.
+        target_size = int(len(adjusted) * weight)
+        adjusted = random.choices(adjusted, k=target_size)
+
+    return adjusted
+
+
+def select_data_sample(
+    data_index: list[tuple[str, int]], index: slice | list[int] | Any
+) -> tuple[str, int] | list[tuple[str, int]]:
+    """Pick entries of a dataset index by slice or by list of positions.
+
+    Args:
+        data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
+        index (Union[slice, list[int], Any]): Index of dataset samples.
+
+    Returns:
+        Union[tuple[str, int], list[tuple[str, int]]]: Selected dataset samples.
+
+    Raises:
+        ValueError: If ``index`` is neither a slice nor a list.
+    """
+    if isinstance(index, slice):
+        # Resolve the slice against the index length to get concrete positions.
+        positions = range(*index.indices(len(data_index)))
+    elif isinstance(index, list):
+        positions = index
+    else:
+        raise ValueError(f"Invalid index type {type(index)}.")
+
+    return [data_index[position] for position in positions]
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/add_token.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/add_token.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/initialization.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/initialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..efb8f22f79f6c6e5abbb2d0410cd928abbf1d120
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/initialization.py
@@ -0,0 +1,43 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+
+from ...accelerator.helper import DeviceType
+from ...accelerator.interface import DistributedInterface
+from ...utils.plugin import BasePlugin
+
+
+class InitPlugin(BasePlugin):
+    """Plugin whose registered implementations return a ``torch.device``.
+
+    NOTE(review): presumably the device on which model weights are initialized; confirm
+    against the callers.
+    """
+
+    def __call__(self) -> torch.device:
+        # Delegates to ``BasePlugin.__call__`` without arguments.
+        return super().__call__()
+
+
+@InitPlugin("init_on_meta").register()
+def init_on_meta() -> torch.device:
+    """Return the meta device on every rank."""
+    device_name = DeviceType.META.value
+    return torch.device(device_name)
+
+
+@InitPlugin("init_on_rank0").register()
+def init_on_rank0() -> torch.device:
+    """Return the CPU device on rank 0 and the meta device on every other rank."""
+    is_rank0 = DistributedInterface().get_rank() == 0
+    device_type = DeviceType.CPU if is_rank0 else DeviceType.META
+    return torch.device(device_type.value)
+
+
+@InitPlugin("init_on_default").register()
+def init_on_default() -> torch.device:
+    """Return the ``current_device`` reported by ``DistributedInterface``."""
+    interface = DistributedInterface()
+    return interface.current_device
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/base.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..265986ccc87f8ad6993f4606eb5895f129e616f7
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/base.py
@@ -0,0 +1,87 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of base kernel class.
+
+Init Phase:
+1. Define base kernel class.
+2. Define abstract methods.
+
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from ....accelerator.helper import DeviceType, get_current_accelerator
+from ....utils.types import HFModel
+
+
+class BaseKernel(ABC):
+    r"""Abstract parent of every kernel implementation.
+
+    Concrete kernels override the class attributes below and implement :meth:`apply`.
+    """
+
+    # Kernel ID: any hashable value identifying a kernel implementation.
+    _kernel_id: Any = ""
+    # Device type the kernel targets ("cuda", "npu", "cpu", etc.).
+    _device: DeviceType = DeviceType.CPU
+
+    @classmethod
+    def get_kernel_id(cls) -> str:
+        """Return the identifier stored in ``_kernel_id``."""
+        return cls._kernel_id
+
+    @classmethod
+    def get_device(cls) -> str:
+        """Return the device type stored in ``_device`` (e.g., "cuda", "npu", "cpu")."""
+        return cls._device
+
+    @classmethod
+    def check_deps(cls) -> bool:
+        """Report whether the kernel's requirements are satisfied.
+
+        The base implementation only compares the kernel's device type with the type of the
+        current accelerator. Subclasses may override it to add their own dependency checks.
+
+        Returns:
+            bool: ``True`` if dependencies are met, ``False`` otherwise.
+
+        .. note::
+            In explicit mode, a failed check for a user-requested implementation should lead
+            to an error rather than a silent switch to another implementation.
+        """
+        return cls._device == get_current_accelerator().type
+
+    @classmethod
+    @abstractmethod
+    def apply(cls, **kwargs) -> HFModel:
+        """Apply the kernel optimization to a model.
+
+        Args:
+            **kwargs: Arbitrary keyword arguments, usually containing the model instance and the kernel configuration.
+
+        Returns:
+            HFModel: The model with the kernel applied.
+
+        Raises:
+            RuntimeError: If the kernel dependencies are not met.
+            NotImplementedError: If the method is not implemented by the subclass.
+
+        Example:
+            >>> from llamafactory.v1.plugins.model_plugins.kernels.interface import apply_kernel
+            >>> model = HFModel(config=config)
+            >>> model = apply_kernel(model=model, kernel_id="npu_fused_moe")
+        """
+        # The base body never applies anything: it reports either unmet dependencies or
+        # the missing override.
+        if cls.check_deps():
+            raise NotImplementedError
+        raise RuntimeError(f"{cls.__name__} is not available but {cls.__name__} kernel was called.")
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/interface.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..7967a43283d34f376f51ff3d9041dfe7caa709f8
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/interface.py
@@ -0,0 +1,140 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of kernel interface.
+
+Init Phase:
+1. Scan all kernels.
+2. Register default kernels.
+3. Define kernel plugin.
+
+"""
+
+import importlib
+from pathlib import Path
+
+from ....utils import logging
+from ....utils.plugin import BasePlugin
+from ....utils.types import HFModel
+from .registry import Registry
+
+
+logger = logging.get_logger(__name__)
+
+
+def scan_all_kernels():
+    """Import every kernel module below the ``ops`` directory and return the registry.
+
+    Scans the ``ops`` directory next to this file recursively for ``.py`` files (skipping
+    ``__init__.py``) and imports each one. Importing triggers the
+    :func:`~registry.register_kernel` decorator, which registers the kernels. A module that
+    fails to import is reported with a warning and skipped.
+
+    Returns:
+        dict[str, type[BaseKernel]]: A dictionary of registered kernels. The registry is
+            returned even when the ``ops`` directory does not exist, so callers always
+            receive a mapping and never ``None``.
+    """
+    package_dir = Path(__file__).parent
+    ops_path = package_dir / "ops"
+
+    if not ops_path.exists():
+        # Nothing to import; still hand back the registry so `default_kernels` stays a mapping.
+        return Registry.get_registered_kernels()
+
+    base_package = __package__
+
+    for file_path in ops_path.rglob("*.py"):
+        if file_path.name == "__init__.py":
+            continue
+
+        # calculate the relative path:
+        # file_path = .../kernels/ops/mlp/npu_swiglu.py
+        # rel_path  = ops/mlp/npu_swiglu.py
+        rel_path = file_path.relative_to(package_dir)
+
+        # build module path: drop the ".py" suffix and join the parts with dots
+        module_name = ".".join(rel_path.with_suffix("").parts)
+        full_module_name = f"{base_package}.{module_name}"
+
+        try:
+            importlib.import_module(full_module_name)
+        except Exception as e:
+            # Best effort: a kernel with missing optional dependencies must not break the scan.
+            logger.warning(f"[Kernel Registry] Failed to import {full_module_name} when loading kernels: {e}")
+
+    return Registry.get_registered_kernels()
+
+
+# Populated once at import time: importing this module scans and imports all kernel modules.
+default_kernels = scan_all_kernels()
+
+
+def get_default_kernels():
+    """List the IDs of the kernels found by the import-time scan.
+
+    Returns:
+        list[str]: List of kernel IDs.
+    """
+    kernel_ids = default_kernels.keys()
+    return list(kernel_ids)
+
+
+def apply_kernel(kernel_id: str, **kwargs) -> HFModel:
+    """Applies a specific kernel to the model.
+
+    Args:
+        kernel_id (str): The ID of the kernel to apply.
+        **kwargs: Keyword arguments passed to the kernel application function.
+                  Typically includes the model instance.
+
+    Returns:
+        HFModel: The model with applied kernel, i.e. whatever the kernel's ``apply`` returns.
+
+    Raises:
+        ValueError: If no kernel is registered under ``kernel_id``.
+    """
+    kernel = default_kernels.get(kernel_id)
+    if kernel is None:
+        raise ValueError(f"Kernel {kernel_id} not found")
+
+    # Hand the result back to the caller; it was previously dropped although a model is documented.
+    return kernel.apply(**kwargs)
+
+
+class KernelPlugin(BasePlugin):
+    """Plugin under which kernel-application strategies are registered."""
+
+
+@KernelPlugin("auto").register()
+def apply_default_kernels(model: HFModel, include_kernels: str | bool | None = None) -> HFModel:
+    """Applies the selected registered kernels to the model.
+
+    Args:
+        model (HFModel): The model instance to apply kernels to.
+        include_kernels (str | bool, optional): Comma-separated list of kernel IDs to apply.
+                                         Whitespace around IDs and empty entries are ignored.
+                                         If "auto" or True, applies all default kernels.
+                                         If None, False or empty, no kernels are applied.
+                                         Defaults to None.
+
+    Returns:
+        HFModel: The model with applied kernels.
+
+    Raises:
+        ValueError: If a requested kernel ID is not registered. This is checked before any
+            kernel is applied, so the model is never left partially patched by a typo.
+    """
+    if not include_kernels:
+        return model
+    elif include_kernels == "auto" or include_kernels is True:
+        use_kernels = list(default_kernels.keys())
+    else:
+        # "kernel_id1, kernel_id2,kernel_id3" -> ["kernel_id1", "kernel_id2", "kernel_id3"]
+        use_kernels = [kernel_id.strip() for kernel_id in include_kernels.split(",") if kernel_id.strip()]
+
+    for kernel_id in use_kernels:
+        if kernel_id not in default_kernels:
+            raise ValueError(f"Kernel {kernel_id} not found")
+
+    for kernel_id in use_kernels:
+        patched = apply_kernel(kernel_id, model=model)
+        # Keep the model a kernel hands back; a ``None`` result means it was patched in place.
+        if patched is not None:
+            model = patched
+
+    return model
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/npu_fused_moe.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/npu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4e292697603c1ffedd397c293d62ed43361e05
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/npu_fused_moe.py
@@ -0,0 +1,343 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of NPU fused MoE kernels.
+
+Init Phase:
+1. Define GMM functions.
+2. Define NPU fused MoE functions.
+3. Register NPU fused MoE kernel.
+
+"""
+
+import types
+
+import torch
+import torch.nn.functional as F
+
+
+try:
+    import torch_npu
+except ImportError:
+    pass
+
+from ......accelerator.helper import DeviceType
+from ......utils.packages import is_transformers_version_greater_than
+from ......utils.types import HFModel
+from ...base import BaseKernel
+from ...registry import register_kernel
+
+
+class GmmFunction(torch.autograd.Function):
+    """Custom autograd function for NPU Grouped Matrix Multiplication (GMM)."""
+
+    @staticmethod
+    def forward(ctx, x, weight, group_list):
+        """Performs the forward pass of Grouped Matrix Multiplication.
+
+        Args:
+            ctx: Context object to save tensors for backward pass.
+            x (Tensor): Input tensor.
+            weight (Tensor): Weight tensor.
+            group_list (list): List of group sizes.
+
+        Returns:
+            Tensor: The result of the grouped matrix multiplication.
+        """
+        ctx.save_for_backward(x, weight)
+        ctx.group_list = group_list
+
+        fwd_output = torch_npu.npu_grouped_matmul(
+            [x], [weight], bias=None, group_list=group_list, split_item=2, group_type=0, group_list_type=1
+        )[0]
+        return fwd_output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Performs the backward pass of Grouped Matrix Multiplication.
+
+        Args:
+            ctx: Context object containing saved tensors.
+            grad_output (Tensor): Gradient with respect to the output.
+
+        Returns:
+            tuple: Gradients with respect to input, weight, and None for group_list.
+        """
+        input_tensor, weight = ctx.saved_tensors
+        group_list = ctx.group_list
+
+        weight = torch.transpose(weight, 1, 2)
+        grad_input = torch_npu.npu_grouped_matmul(
+            [grad_output], [weight], bias=None, group_list=group_list, split_item=2, group_type=0, group_list_type=1
+        )[0]
+        grad_weight = torch_npu.npu_grouped_matmul(
+            [input_tensor.T],
+            [grad_output],
+            bias=None,
+            group_list=group_list,
+            split_item=3,
+            group_type=2,
+            group_list_type=1,
+        )[0]
+        return grad_input, grad_weight, None
+
+
+class HybridGmmFunction(torch.autograd.Function):
+    """Custom autograd function for Hybrid Grouped Matrix Multiplication on NPU."""
+
+    @staticmethod
+    def forward(ctx, num_experts, *args):
+        """Performs the forward pass of Hybrid GMM.
+
+        Args:
+            ctx: Context object to save tensors.
+            num_experts (int): Number of experts.
+            *args: The first ``num_experts`` entries are the inputs, the remaining ones the weights.
+
+        Returns:
+            tuple: The outputs of the grouped matrix multiplication.
+        """
+        x_list = list(args[:num_experts])
+        weight_list = list(args[num_experts:])
+        # Row count of each input, kept for the weight-gradient grouped matmul in backward.
+        split_sizes = [x.shape[0] for x in x_list]
+        ctx.split_sizes = split_sizes
+        ctx.num_experts = num_experts
+
+        ctx.save_for_backward(*args)
+
+        outputs = torch_npu.npu_grouped_matmul(
+            x_list, weight_list, bias=None, group_list=None, split_item=0, group_type=-1
+        )
+        return tuple(outputs)
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        """Performs the backward pass of Hybrid GMM.
+
+        Args:
+            ctx: Context object containing saved tensors.
+            *grad_outputs: Gradients with respect to the outputs.
+
+        Returns:
+            tuple: ``None`` (for ``num_experts``) followed by the input and weight gradients.
+        """
+        saved_tensors = ctx.saved_tensors
+        num_experts = ctx.num_experts
+        split_sizes = ctx.split_sizes
+
+        x_list = list(saved_tensors[:num_experts])
+        weight_list = list(saved_tensors[num_experts:])
+
+        grad_outputs_contiguous = [g.contiguous() for g in grad_outputs]
+        # Input gradients: output gradients against the transposed weights, list against list.
+        w_t_list = [w.t() for w in weight_list]
+        grad_x_list = torch_npu.npu_grouped_matmul(
+            grad_outputs_contiguous,  # List[Tensor], each [M_i, N]
+            w_t_list,  # List[Tensor], each [N, K] (view)
+            bias=None,
+            group_list=None,
+            split_item=0,
+            group_type=-1,
+        )
+        # Weight gradients: concatenate inputs and output gradients, grouped by ``split_sizes``.
+        x_concat = torch.cat(x_list, dim=0)
+        dy_concat = torch.cat(grad_outputs_contiguous, dim=0)  # [Total_M, N]
+
+        group_list = torch.tensor(split_sizes, device=x_concat.device, dtype=torch.int64)
+
+        grad_w_stack = torch_npu.npu_grouped_matmul(
+            [x_concat.t()],
+            [dy_concat],
+            bias=None,
+            group_list=group_list,
+            split_item=3,
+            group_type=2,
+            group_list_type=1,
+        )[0]
+        # The stacked result must be 3-D; it is unbound along dim 0 into a list of gradients.
+        if grad_w_stack.dim() == 3:
+            grad_w_list = list(torch.unbind(grad_w_stack, dim=0))
+        else:
+            raise RuntimeError(f"Unexpected grad_w_stack shape: {grad_w_stack.shape}")
+        # The leading None is the gradient slot of the non-tensor ``num_experts`` argument.
+        return (None, *grad_x_list, *grad_w_list)
+
+
+class NpuMoeFused:
+    """Container for NPU fused MoE forward functions."""
+
+    @staticmethod
+    def npu_moe_experts_forward(
+        self, hidden_states: torch.Tensor, routing_weights: torch.Tensor, router_indices: torch.Tensor
+    ) -> torch.Tensor:
+        """Forward pass for MoE experts using NPU fused operations.
+
+        Args:
+            self: The MoE layer instance.
+            hidden_states (Tensor): Input hidden states.
+            routing_weights (Tensor): Routing weights.
+            router_indices (Tensor): Router indices.
+
+        Returns:
+            Tensor: Output tensor after expert computation.
+        """
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        permuted_hidden_states, row_ids_map = torch_npu.npu_moe_token_permute(
+            hidden_states, router_indices.to(torch.int32)
+        )
+        tokens_per_expert = torch.histc(router_indices, bins=self.num_experts, min=0, max=self.num_experts)
+        intermediate_hidden_states = GmmFunction.apply(permuted_hidden_states, self.gate_up_proj, tokens_per_expert)
+        intermediate_activations = torch_npu.npu_swiglu(intermediate_hidden_states, dim=-1)
+        output = GmmFunction.apply(intermediate_activations, self.down_proj, tokens_per_expert)
+        next_states = torch_npu.npu_moe_token_unpermute(output, row_ids_map, probs=routing_weights)
+        next_states = next_states.view(batch_size, -1, self.hidden_size)
+        return next_states
+
+    @staticmethod
+    def npu_moe_sparse_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        r"""Forward pass for sparse MoE block using NPU optimization.
+
+        Args:
+            self: The MoE sparse block instance.
+            hidden_states (Tensor): Input hidden states.
+
+        Returns:
+            Tensor: The routed output.
+        """
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        router_logits = self.gate(hidden_states)
+        routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float)
+        routing_weights, router_indices = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+        routing_weights = routing_weights.to(hidden_states.dtype)
+        hidden_states = hidden_states.reshape(batch_size, -1, self.hidden_size)
+        routed_out = self.experts(hidden_states, routing_weights, router_indices)
+        return routed_out
+
+
+class Qwen3NpuMoeFused:
+    """Container for Qwen3 NPU fused MoE forward functions."""
+
+    @staticmethod
+    def qwen3moe_sparse_moe_block_forward(self, hidden_states: torch.Tensor):
+        """Forward pass for Qwen3 sparse MoE block using NPU fused operations.
+
+        Args:
+            self: The Qwen3 MoE block instance.
+            hidden_states (Tensor): Input hidden states.
+
+        Returns:
+            tuple: A tuple containing the next states and router logits.
+        """
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        router_logits = self.gate(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+
+        if self.norm_topk_prob:
+            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        permuted_hidden_states, row_ids_map = torch_npu.npu_moe_token_permute(hidden_states, selected_experts.int())
+
+        tokens_per_expert = torch.histc(
+            selected_experts.float(), bins=self.num_experts, min=0, max=self.num_experts
+        ).long()
+        split_sizes = tokens_per_expert.tolist()
+
+        input_list = list(torch.split(permuted_hidden_states, split_sizes, dim=0))
+
+        gate_weights = [e.gate_proj.weight.t() for e in self.experts]
+        up_weights = [e.up_proj.weight.t() for e in self.experts]
+        down_weights = [e.down_proj.weight.t() for e in self.experts]
+
+        gate_out_tuple = HybridGmmFunction.apply(len(input_list), *input_list, *gate_weights)
+        up_out_tuple = HybridGmmFunction.apply(len(input_list), *input_list, *up_weights)
+
+        inter_list = [F.silu(g) * u for g, u in zip(gate_out_tuple, up_out_tuple)]
+
+        down_out_tuple = HybridGmmFunction.apply(len(inter_list), *inter_list, *down_weights)
+
+        grouped_output = torch.cat(down_out_tuple, dim=0)
+
+        next_states = torch_npu.npu_moe_token_unpermute(grouped_output, row_ids_map, probs=routing_weights)
+
+        next_states = next_states.view(batch_size, sequence_length, -1)
+        return next_states, router_logits
+
+
+# Patch table: model architecture name -> {module class name -> replacement ``forward``}.
+kernel_moe_mapping = {
+    "Qwen3VLMoeForConditionalGeneration": {
+        "Qwen3VLMoeTextExperts": NpuMoeFused.npu_moe_experts_forward,
+        "Qwen3VLMoeTextSparseMoeBlock": NpuMoeFused.npu_moe_sparse_block_forward,
+    }
+}
+# The Qwen3 MoE patch is only added when ``is_transformers_version_greater_than("5.0.0")`` is false.
+if not is_transformers_version_greater_than("5.0.0"):
+    kernel_moe_mapping["Qwen3MoeForCausalLM"] = {
+        "Qwen3MoeSparseMoeBlock": Qwen3NpuMoeFused.qwen3moe_sparse_moe_block_forward
+    }
+
+
+@register_kernel
+class NpuFusedMoEKernel(BaseKernel):
+    """NPU Fused MoE Kernel implementation."""
+
+    _kernel_id = "npu_fused_moe"
+    _device = DeviceType.NPU
+
+    @classmethod
+    def apply(cls, **kwargs) -> HFModel:
+        """Applies the NPU fused MoE kernel to the model.
+
+        Args:
+            **kwargs: Keyword arguments containing the model.
+
+        Returns:
+            HFModel: The model with patched MoE forward functions.
+
+        Raises:
+            ValueError: If the model is not provided.
+            RuntimeError: If dependencies are not met.
+        """
+        model = kwargs.get("model", None)
+        if model is None:
+            raise ValueError(f"HFModel instance is required for {cls.__name__}.")
+
+        if not cls.check_deps():
+            raise RuntimeError("torch_npu is not available but NpuMoEFusedMoEKernel was called.")
+
+        archs = getattr(model.config, "architectures", None) or []
+        target_moe_mapping = None
+        for arch in archs:
+            if arch in kernel_moe_mapping:
+                target_moe_mapping = kernel_moe_mapping[arch]
+                break
+
+        if target_moe_mapping is None:
+            return model
+
+        for module in model.modules():
+            class_name = module.__class__.__name__
+            if class_name in target_moe_mapping:
+                new_forward_func = target_moe_mapping[class_name]
+                module.forward = types.MethodType(new_forward_func, module)
+
+        return model
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/npu_swiglu.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/npu_swiglu.py
new file mode 100644
index 0000000000000000000000000000000000000000..a45077bc0049b97d3e9b4ae7e3cf3f28975f4ca6
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/mlp/npu_swiglu.py
@@ -0,0 +1,168 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of NPU fused SwiGLU kernels.
+
+Init Phase:
+1. Define SwiGLU forward functions.
+2. Register NPU fused SwiGLU kernel.
+
+"""
+
+import re
+import types
+
+import torch
+
+from ......accelerator.helper import DeviceType
+from ......utils.types import HFModel
+from ...base import BaseKernel
+from ...registry import register_kernel
+
+
+try:
+    import torch_npu
+except ImportError:
+    pass
+
+
+def npu_swiglu_forward(self, hidden_state):
+    """SwiGLU forward pass for NPU.
+
+    Args:
+        self: The MLP layer instance.
+        hidden_state (Tensor): Input hidden state.
+
+    Returns:
+        Tensor: Output of SwiGLU.
+    """
+    return self.down_proj(
+        torch_npu.npu_swiglu(torch.cat((self.gate_proj(hidden_state), self.up_proj(hidden_state)), dim=-1), dim=-1)
+    )
+
+
+def _npu_swiglu_glm4_forward(self, hidden_states):
+    """SwiGLU forward pass for GLM4 on NPU.
+
+    Args:
+        self: The GLM4 MLP layer instance.
+        hidden_states (Tensor): Input hidden states.
+
+    Returns:
+        Tensor: Output of SwiGLU.
+    """
+    up_states = self.gate_up_proj(hidden_states)
+    gate, up_states = up_states.chunk(2, dim=-1)
+    return self.down_proj(torch_npu.npu_swiglu(torch.cat((gate, up_states), dim=-1), dim=-1))
+
+
+def _npu_swiglu_gemma3ntext_forward(self, hidden_states):
+    """SwiGLU forward pass for Gemma3nText on NPU.
+
+    Args:
+        self: The Gemma3nText MLP layer instance.
+        hidden_states (Tensor): Input hidden states.
+
+    Returns:
+        Tensor: Output of SwiGLU.
+    """
+    gate_proj = self.gate_proj(hidden_states)
+    if self.activation_sparsity > 0.0:
+        gate_proj = self._gaussian_topk(gate_proj)
+    down_proj = self.down_proj(
+        torch_npu.npu_swiglu(torch.cat((gate_proj, self.up_proj(hidden_states)), dim=-1), dim=-1)
+    )
+    return down_proj
+
+
+@register_kernel
+class NpuSwiGluKernel(BaseKernel):
+    """NPU Kernel for fused SwiGLU activation."""
+
+    # just support apply to the following module layers
+    expect_modules = frozenset(
+        {
+            "Qwen3VLMoeTextMLP",
+            "Qwen3VLTextMLP",
+            "Qwen3OmniMoeThinkerTextMLP",
+            "Qwen3OmniMoeMLP",
+            "Qwen3OmniMoeTalkerTextMLP",
+            "Qwen3OmniMoeCode2WavMlp",
+            "Qwen3NextMLP",
+            "Qwen3MoeMLP",
+            "Qwen3MLP",
+            "Qwen2MLP",
+            "Qwen2MoeMLP",
+            "Qwen2_5_VLMLP",
+            "Qwen2_5OmniMLP",
+            "Llama4TextMLP",
+            "LlamaMLP",
+            "Glm4MLP",
+            "Glm4MoeMLP",
+            "Glm4vMoeTextMLP",
+            "Gemma3MLP",
+            "Gemma2MLP",
+            "Gemma3nTextMLP",
+            "Phi3MLP",
+            "DeepseekV2MLP",
+            "DeepseekV3MLP",
+            "SeedOssMLP",
+        }
+    )
+
+    _kernel_id = "npu_fused_swiglu"
+    _device = DeviceType.NPU
+
+    @classmethod
+    def apply(cls, **kwargs) -> "HFModel":
+        """Applies the NPU fused SwiGLU kernel to the model.
+
+        Args:
+            **kwargs: Keyword arguments containing the model.
+
+        Returns:
+            HFModel: The model with patched SwiGLU forward functions.
+
+        Raises:
+            ValueError: If the model is not provided.
+            RuntimeError: If dependencies are not met.
+        """
+        model = kwargs.get("model", None)
+        if model is None:
+            raise ValueError(f"HFModel instance is required for {cls.__name__}.")
+
+        if not cls.check_deps():
+            raise RuntimeError("torch_npu is not available but NpuSwiGluKernel was called.")
+
+        # Mapping of specific mlp modules to their corresponding kernel implementations
+        kernel_mapping = {
+            "Glm4MLP": _npu_swiglu_glm4_forward,
+            "Glm4vTextMLP": _npu_swiglu_glm4_forward,
+            "Phi3MLP": _npu_swiglu_glm4_forward,
+            "Gemma3nTextMLP": _npu_swiglu_gemma3ntext_forward,
+        }
+
+        swiglu_pattern = re.compile("MLP", re.IGNORECASE)
+        for name, module in model.named_modules():
+            # Match any module whose class name contains "MLP"
+            if (
+                re.search(swiglu_pattern, module.__class__.__name__)
+                and module.__class__.__name__ in cls.expect_modules
+            ):
+                # Bind function as an instance method to preserve `self` semantics
+                # and replace the original forward
+                kernel_func = kernel_mapping.get(module.__class__.__name__, npu_swiglu_forward)
+                module.forward = types.MethodType(kernel_func, module)
+
+        return model
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rms_norm/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rms_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rms_norm/npu_rms_norm.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rms_norm/npu_rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..35057f451c9445a058ef9318696c9541d99cbfc1
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rms_norm/npu_rms_norm.py
@@ -0,0 +1,91 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of NPU fused RMSNorm kernels.
+
+Init Phase:
+1. Define RMSNorm forward function.
+2. Register NPU fused RMSNorm kernel.
+
+"""
+
+import re
+import types
+
+from ......accelerator.helper import DeviceType
+from ......utils.types import HFModel
+from ...base import BaseKernel
+from ...registry import register_kernel
+
+
+def npu_rms_norm_forward(self, hidden_states):
+    """NPU forward implementation for RMSNorm.
+
+    Args:
+        self: RMSNorm module instance with `weight` and `variance_epsilon`.
+        hidden_states (Tensor): Input hidden states tensor, same shape as the baseline.
+
+    Returns:
+        Tensor: Normalized tensor consistent with the baseline RMSNorm behavior.
+    """
+    import torch_npu
+
+    return torch_npu.npu_rms_norm(hidden_states, self.weight, epsilon=self.variance_epsilon)[0]
+
+
+@register_kernel
+class NpuRMSNormKernel(BaseKernel):
+    """NPU kernel wrapper for RMSNorm that applies the replacement within a model."""
+
+    _kernel_id = "npu_fused_rmsnorm"
+    _device = DeviceType.NPU
+
+    @classmethod
+    def apply(cls, **kwargs) -> "HFModel":
+        """Iterate the model and apply NPU-optimized forward to matched RMSNorm modules.
+
+        Key points:
+        - Match modules whose class name contains "RMSNorm" (case-insensitive).
+        - Bind `_npu_rms_forward` as an instance method via `types.MethodType` to
+          replace the original `forward`.
+        - Do not modify weights, hyperparameters, or module structure to ensure
+          numerical behavior and interface consistency.
+
+        Args:
+            **kwargs: Keyword arguments containing the model.
+
+        Returns:
+            HFModel: The model with NPU fused RMSNorm.
+
+        Raises:
+            RuntimeError: If torch_npu is not available.
+            ValueError: If the model is not provided.
+        """
+        model = kwargs.get("model")
+        if model is None:
+            raise ValueError(f"HFModel instance is required for {cls.__name__}.")
+
+        if not cls.check_deps():
+            raise RuntimeError(f"torch_npu is not available but {cls.__name__} was called.")
+
+        rms_norm_pattern = re.compile("RMSNorm", re.IGNORECASE)
+
+        for name, module in model.named_modules():
+            # Match any module whose class name contains "RMSNorm"
+            if re.search(rms_norm_pattern, module.__class__.__name__):
+                # Bind function as an instance method to preserve `self` semantics
+                # and replace the original forward
+                module.forward = types.MethodType(npu_rms_norm_forward, module)
+
+        return model
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rope/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rope/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rope/npu_rope.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rope/npu_rope.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f3e290a00e0dae30eade09f09c1b84a372e222b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/ops/rope/npu_rope.py
@@ -0,0 +1,149 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of NPU fused RoPE kernels.
+
+Init Phase:
+1. Define RoPE forward functions.
+2. Register NPU fused RoPE kernel.
+
+"""
+
+import sys
+
+import torch
+
+from ......accelerator.helper import DeviceType
+from ......utils.logging import get_logger
+from ......utils.types import HFModel
+from ...base import BaseKernel
+from ...registry import register_kernel
+
+
+logger = get_logger(__name__)
+
+try:
+    import torch_npu
+except ImportError:
+    pass
+
+
+def _apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors using NPU optimization.
+
+    Args:
+        q (Tensor): Query tensor.
+        k (Tensor): Key tensor.
+        cos (Tensor): Cosine part of embedding.
+        sin (Tensor): Sine part of embedding.
+        position_ids (Tensor, optional): Position IDs. Default: ``None``.
+        unsqueeze_dim (int): Dimension to unsqueeze cos and sin. Default: 1.
+
+    Returns:
+        tuple: (q_embed, k_embed) The embedded query and key tensors.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = torch_npu.npu_rotary_mul(q, cos, sin)
+    k_embed = torch_npu.npu_rotary_mul(k, cos, sin)
+    return q_embed, k_embed
+
+
+def _apply_multimodal_rotary_pos_emb_qwen25_vl(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding with multimodal sections (Qwen2-VL) on NPU.
+
+    Args:
+        q (Tensor): Query tensor.
+        k (Tensor): Key tensor.
+        cos (Tensor): Cosine part of embedding.
+        sin (Tensor): Sine part of embedding.
+        mrope_section (Tensor): Multimodal RoPE section.
+        unsqueeze_dim (int): Dimension to unsqueeze cos and sin. Default: 1.
+
+    Returns:
+        tuple: (q_embed, k_embed) The embedded query and key tensors.
+    """
+    mrope_section = mrope_section * 2
+    cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
+        unsqueeze_dim
+    )
+    sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
+        unsqueeze_dim
+    )
+
+    q_embed = torch_npu.npu_rotary_mul(q, cos, sin)
+    k_embed = torch_npu.npu_rotary_mul(k, cos, sin)
+    return q_embed, k_embed
+
+
+@register_kernel
+class NpuRoPEKernel(BaseKernel):
+    """NPU Kernel for Rotary Position Embedding."""
+
+    _kernel_id = "npu_fused_rope"
+    _device = DeviceType.NPU
+
+    @classmethod
+    def apply(cls, **kwargs) -> "HFModel":
+        """Apply RoPE acceleration by monkey-patching `apply_rotary_pos_emb`.
+
+        This function iterates through the model's modules to find attention layers,
+        identifies the module where they are defined, and replaces the original
+        `apply_rotary_pos_emb` function in that module's namespace with the
+        NPU-accelerated version from this file.
+
+        Args:
+            **kwargs: Keyword arguments containing the model.
+
+        Returns:
+            HFModel: The model with patched RoPE functions.
+
+        Raises:
+            RuntimeError: If dependencies are not met.
+            ValueError: If the model is not provided.
+        """
+        if not cls.check_deps():
+            raise RuntimeError(f"torch_npu is not available but {cls.__name__} was called.")
+
+        model = kwargs.get("model", None)
+        if model is None:
+            raise ValueError(f"HFModel instance is required for {cls.__name__}.")
+
+        _modules = set()
+        for module in model.modules():
+            if "Attention" in module.__class__.__name__:
+                module_name = module.__class__.__module__
+                if module_name in _modules:
+                    continue
+                try:
+                    target_module = sys.modules[module_name]
+                    if hasattr(target_module, "apply_rotary_pos_emb"):
+                        if getattr(target_module, "apply_rotary_pos_emb") is not _apply_rotary_pos_emb:
+                            setattr(target_module, "apply_rotary_pos_emb", _apply_rotary_pos_emb)
+                            _modules.add(module_name)
+                    if hasattr(target_module, "apply_multimodal_rotary_pos_emb"):
+                        if (
+                            getattr(target_module, "apply_multimodal_rotary_pos_emb")
+                            is not _apply_multimodal_rotary_pos_emb_qwen25_vl
+                        ):
+                            setattr(
+                                target_module,
+                                "apply_multimodal_rotary_pos_emb",
+                                _apply_multimodal_rotary_pos_emb_qwen25_vl,
+                            )
+                            _modules.add(module_name)
+                except Exception as e:
+                    logger.warning_rank0_once(f"Failed to apply RoPE kernel to module {module_name}: {e}")
+
+        return model
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/registry.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..2621e4bad4ecbc523abb96585d7cdb15290898ee
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/kernels/registry.py
@@ -0,0 +1,96 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The definition of kernel registry.
+
+Init Phase:
+1. Define kernel registry.
+2. Register kernels.
+
+"""
+
+from ....accelerator.helper import get_current_accelerator
+from .base import BaseKernel
+
+
+__all__ = ["Registry", "register_kernel"]
+
+
+class Registry:
+    """Registry for managing kernel implementations.
+
+    Storage structure: ``{ "kernel_id": Class }``
+    """
+
+    _kernels: dict[str, type[BaseKernel]] = {}
+
+    @classmethod
+    def register(cls, kernel_cls: type[BaseKernel]) -> type[BaseKernel] | None:
+        """Decorator to register a kernel class.
+
+        The class must inherit from :class:`BaseKernel` and specify ``_kernel_id`` and ``_device`` attributes.
+
+        Args:
+            kernel_cls (type[BaseKernel]): The kernel class to register.
+
+        Returns:
+            type[BaseKernel] | None: The registered kernel class if the device type matches the current accelerator
+
+        Raises:
+            TypeError: If the class does not inherit from :class:`BaseKernel`.
+            ValueError: If the kernel ID is missing or already registered.
+        """
+        if not issubclass(kernel_cls, BaseKernel):
+            raise TypeError(f"Class {kernel_cls} must inherit from BaseKernel")
+
+        kernel_id = kernel_cls.get_kernel_id()
+        device = kernel_cls.get_device()
+
+        # The device type of the current accelerator does not match the device type required by the kernel, skip registration
+        if device != get_current_accelerator().type:
+            return
+
+        if not kernel_id:
+            raise ValueError(f"Kernel ID (_kernel_id) is needed for {kernel_cls} to register")
+
+        if kernel_id in cls._kernels:
+            raise ValueError(f"{kernel_id} already registered! The registered kernel is {cls._kernels[kernel_id]}")
+
+        cls._kernels[kernel_id] = kernel_cls
+        return kernel_cls
+
+    @classmethod
+    def get(cls, kernel_id: str) -> type[BaseKernel] | None:
+        """Retrieves a registered kernel implementation by its ID.
+
+        Args:
+            kernel_id (str): The ID of the kernel to retrieve.
+
+        Returns:
+            type[BaseKernel] | None: The kernel class if found, else ``None``.
+        """
+        return cls._kernels.get(kernel_id)
+
+    @classmethod
+    def get_registered_kernels(cls) -> dict[str, type[BaseKernel]]:
+        """Returns a dictionary of all registered kernels.
+
+        Returns:
+            dict[str, type[BaseKernel]]: Dictionary mapping kernel IDs to kernel classes.
+        """
+        return cls._kernels
+
+
+# Decorator alias: ``@register_kernel`` on a class calls ``Registry.register`` with that class.
+register_kernel = Registry.register
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/peft.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/peft.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3f37482c74e6b20fb628d80b5a70c2878ce50ff
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/peft.py
@@ -0,0 +1,57 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Literal, TypedDict
+
+from peft import LoraConfig, PeftModel, get_peft_model
+
+from ...utils.plugin import BasePlugin
+from ...utils.types import HFModel
+
+
+class LoraConfigDict(TypedDict, total=False):
+    """Keys of a LoRA plugin config; ``total=False`` makes every key optional."""
+
+    name: Literal["lora"]
+    """Plugin name."""
+    r: int
+    """Lora rank."""
+    lora_alpha: int
+    """Lora alpha."""
+    target_modules: list[str]
+    """Target modules."""
+
+
+class FreezeConfigDict(TypedDict, total=False):
+    """Keys of a freeze plugin config; ``total=False`` makes every key optional."""
+
+    name: Literal["freeze"]
+    """Plugin name."""
+    freeze_trainable_layers: int
+    """Freeze trainable layers."""
+    freeze_trainable_modules: list[str] | None
+    """Freeze trainable modules."""
+
+
+class PeftPlugin(BasePlugin):
+    """Plugin entry point for parameter-efficient fine-tuning methods.
+
+    Implementations are registered per method name, e.g. ``@PeftPlugin("lora").register()``.
+    """
+
+    def __call__(self, model: HFModel, config: dict, is_train: bool) -> HFModel:
+        """Delegate to ``BasePlugin.__call__`` with the model, its PEFT config and the training flag.
+
+        Args:
+            model (HFModel): Model to adapt.
+            config (dict): Method-specific config (see the ``*ConfigDict`` types in this module).
+            is_train (bool): Whether the model is being prepared for training.
+
+        Returns:
+            HFModel: Whatever ``BasePlugin.__call__`` returns (presumably the result of the
+                implementation registered for this plugin name -- confirm against ``BasePlugin``).
+        """
+        # Forward ``is_train`` as well: the registered implementations declare it as a required argument.
+        return super().__call__(model, config, is_train)
+
+
+@PeftPlugin("lora").register()
+def get_lora_model(model: HFModel, config: LoraConfigDict, is_train: bool) -> PeftModel:
+    """Wrap ``model`` with LoRA adapters built from ``config``.
+
+    Args:
+        model (HFModel): Base model to wrap.
+        config (LoraConfigDict): LoRA options; every key except ``name`` is passed to ``peft.LoraConfig``.
+        is_train (bool): Accepted for the plugin call signature; not used by this implementation.
+
+    Returns:
+        PeftModel: The result of ``peft.get_peft_model``.
+    """
+    # ``name`` is declared in ``LoraConfigDict`` as the plugin name; it is not a ``LoraConfig`` field,
+    # so unpacking it would raise ``TypeError``.
+    lora_kwargs = {key: value for key, value in config.items() if key != "name"}
+    return get_peft_model(model, LoraConfig(**lora_kwargs))
+
+
+@PeftPlugin("freeze").register()
+def get_freeze_model(model: HFModel, config: FreezeConfigDict, is_train: bool) -> HFModel:
+    """Placeholder for the ``freeze`` method registered on ``PeftPlugin``.
+
+    Raises:
+        NotImplementedError: Always; this function has no implementation yet.
+    """
+    raise NotImplementedError()
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/quantization.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/rendering.py b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/rendering.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ca8b43fc101578377f40c8aa67a8142d58fc023
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/model_plugins/rendering.py
@@ -0,0 +1,235 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import re
+
+from ...utils.constants import IGNORE_INDEX
+from ...utils.helper import get_tokenizer
+from ...utils.plugin import BasePlugin
+from ...utils.types import Message, ModelInput, Processor, ToolCall
+
+
+class RenderingPlugin(BasePlugin):
+    """Looks up template-specific render/parse functions by key (``self[...]``) and calls them."""
+
+    def render_messages(
+        self,
+        processor: Processor,
+        messages: list[Message],
+        tools: str | None = None,
+        is_generate: bool = False,
+    ) -> ModelInput:
+        """Render messages in the template format.
+
+        Args:
+            processor (Processor): Processor handed to the registered renderer.
+            messages (list[Message]): Conversation to render.
+            tools (str | None): Tool definitions, forwarded unchanged.
+            is_generate (bool): Forwarded unchanged to the registered renderer.
+
+        Returns:
+            ModelInput: The value returned by the function registered as ``"render_messages"``.
+        """
+        return self["render_messages"](processor, messages, tools, is_generate)
+
+    def parse_messages(self, generated_text: str) -> Message:
+        """Parse generated text in the template format.
+
+        Args:
+            generated_text (str): Raw text produced by the model.
+
+        Returns:
+            Message: The value returned by the function registered as ``"parse_message"``.
+        """
+        # Parsers are registered under the singular key ``"parse_message"`` (see the qwen3_nothink
+        # parser in this module); looking up ``"parse_messages"`` would never find them.
+        return self["parse_message"](generated_text)
+
+
+def _update_model_input(
+    processor: Processor,
+    input_ids: list[int],
+    labels: list[int],
+    loss_weights: list[float],
+    temp_str: str,
+    temp_weight: float,
+) -> str:
+    """Tokenize ``temp_str`` and append it to the running model input (the three lists are mutated in place).
+
+    Args:
+        processor (Processor): Processor passed to ``get_tokenizer`` to obtain the tokenizer.
+        input_ids (list[int]): Token IDs collected so far; extended with the new token IDs.
+        labels (list[int]): Labels collected so far; extended with the new token IDs, or with
+            ``IGNORE_INDEX`` when ``temp_weight`` does not exceed ``1e-6``.
+        loss_weights (list[float]): Per-token loss weights collected so far; extended with ``temp_weight``.
+        temp_str (str): Text to append; an empty string is a no-op.
+        temp_weight (float): Loss weight assigned to every token of ``temp_str``.
+
+    Returns:
+        str: Always the empty string, so callers can reset their text buffer with the return value.
+    """
+    if not temp_str:
+        return ""
+
+    tokenizer = get_tokenizer(processor)
+    # The chunk is encoded as-is: ``add_special_tokens=False`` is passed to the tokenizer.
+    temp_ids = tokenizer.encode(temp_str, add_special_tokens=False)
+    input_ids.extend(temp_ids)
+    loss_weights.extend([temp_weight] * len(temp_ids))
+    # Weights at or below 1e-6 count as "no loss": those tokens are masked out of the labels.
+    if temp_weight > 1e-6:
+        labels.extend(temp_ids)
+    else:
+        labels.extend([IGNORE_INDEX] * len(temp_ids))
+
+    return ""
+
+
+def _join_text_contents(message: Message) -> str:
+    """Concatenate the ``value`` of every content item of ``message``.
+
+    Raises:
+        ValueError: If an item has a type other than ``"text"``.
+    """
+    parts = []
+    for content in message["content"]:
+        if content["type"] != "text":
+            raise ValueError(f"Unsupported content type: {content['type']}")
+
+        parts.append(content["value"])
+
+    return "".join(parts)
+
+
+@RenderingPlugin("qwen3_nothink").register("render_messages")
+def render_qwen3_nothink_messages(
+    processor: Processor,
+    messages: list[Message],
+    tools: str | None = None,
+    is_generate: bool = False,
+) -> ModelInput:
+    """Render messages in the Qwen3 nothink template format.
+
+    The conversation is rendered chunk by chunk (system/tools header, then one chunk per message). Each chunk
+    is tokenized on its own by ``_update_model_input`` and appended to the token IDs, labels and loss weights.
+    Unless a message carries its own ``loss_weight``, assistant messages get weight 1.0 and all others 0.0.
+
+    See https://huggingface.co/spaces/huggingfacejs/chat-template-playground?modelId=Qwen/Qwen3-4B-Instruct-2507
+
+    Args:
+        processor (Processor): Processor used to obtain the tokenizer.
+        messages (list[Message]): Conversation to render. May be empty.
+        tools (str | None): JSON string holding one tool definition or a list of them.
+        is_generate (bool): If true, append the ``<|im_start|>assistant`` generation prompt.
+
+    Returns:
+        ModelInput: ``input_ids``, an all-ones ``attention_mask``, ``labels`` and ``loss_weights``.
+
+    Raises:
+        ValueError: If ``tools`` or a tool call is not valid JSON, or a content type is unsupported.
+    """
+    input_ids, labels, loss_weights = [], [], []
+    temp_str, temp_weight = "", 0.0
+    # Guard against an empty conversation instead of failing on ``messages[0]``.
+    has_system_prompt = bool(messages) and messages[0]["role"] == "system"
+    if tools:
+        temp_str += "<|im_start|>system\n"
+        if has_system_prompt:
+            temp_str += _join_text_contents(messages[0]) + "\n\n"
+            temp_weight = messages[0].get("loss_weight", 0.0)
+
+        temp_str += (
+            "# Tools\n\nYou may call one or more functions to assist with the user query.\n\n"
+            "You are provided with function signatures within <tools></tools> XML tags:\n<tools>"
+        )
+        try:
+            tools = json.loads(tools)
+        except json.JSONDecodeError as err:
+            raise ValueError(f"Invalid tools format: {str(tools)}.") from err
+
+        if not isinstance(tools, list):
+            tools = [tools]
+
+        for tool in tools:
+            temp_str += "\n" + json.dumps(tool, ensure_ascii=False)
+
+        temp_str += (
+            "\n</tools>\n\nFor each function call, return a json object with function name "
+            'and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{"name": '
+            '<function-name>, "arguments": <args-json-object>}\n</tool_call><|im_end|>\n'
+        )
+    elif has_system_prompt:
+        temp_str += "<|im_start|>system\n" + _join_text_contents(messages[0]) + "<|im_end|>\n"
+        temp_weight = messages[0].get("loss_weight", 0.0)
+
+    temp_str = _update_model_input(processor, input_ids, labels, loss_weights, temp_str, temp_weight)
+
+    for turn_idx, message in enumerate(messages):
+        role = message["role"]
+        # A system message at index 0 was already rendered above, so it matches no branch here.
+        if role == "user" or (role == "system" and turn_idx != 0):
+            temp_str += "<|im_start|>" + role + "\n" + _join_text_contents(message) + "<|im_end|>\n"
+            temp_weight = message.get("loss_weight", 0.0)
+        elif role == "assistant":
+            temp_str += "<|im_start|>" + role + "\n"
+            for val_idx, content in enumerate(message["content"]):
+                if content["type"] == "text":
+                    temp_str += content["value"]
+                elif content["type"] == "reasoning":
+                    temp_str += "<thinking>\n" + content["value"] + "\n</thinking>\n\n"  # avoid using special tokens
+                elif content["type"] == "tool_call":
+                    # Separate a tool call from preceding text or from a previous tool call.
+                    if val_idx != 0 and message["content"][val_idx - 1]["type"] in ["text", "tool_call"]:
+                        temp_str += "\n"
+
+                    try:
+                        tool_call: ToolCall = json.loads(content["value"])
+                    except json.JSONDecodeError as err:
+                        raise ValueError(f"Invalid tool call format: {content['value']}.") from err
+
+                    temp_str += (
+                        '<tool_call>\n{"name": "'
+                        + tool_call["name"]
+                        + '", "arguments": '
+                        + json.dumps(tool_call["arguments"], ensure_ascii=False)
+                        + "}\n</tool_call>"
+                    )
+
+                else:
+                    raise ValueError(f"Unsupported content type: {content['type']}")
+
+            temp_str += "<|im_end|>\n"
+            temp_weight = message.get("loss_weight", 1.0)
+        elif role == "tool":
+            # Consecutive tool messages share one ``user`` turn: open it on the first one ...
+            if turn_idx == 0 or messages[turn_idx - 1]["role"] != "tool":
+                temp_str += "<|im_start|>user"
+
+            temp_str += "\n<tool_response>\n" + _join_text_contents(message) + "\n</tool_response>"
+            # ... and close it after the last one.
+            if turn_idx == len(messages) - 1 or messages[turn_idx + 1]["role"] != "tool":
+                temp_str += "<|im_end|>\n"
+
+            temp_weight = message.get("loss_weight", 0.0)
+
+        temp_str = _update_model_input(processor, input_ids, labels, loss_weights, temp_str, temp_weight)
+
+    if is_generate:
+        temp_str += "<|im_start|>assistant\n"
+        temp_weight = 0.0
+
+    temp_str = _update_model_input(processor, input_ids, labels, loss_weights, temp_str, temp_weight)
+
+    attention_mask = [1] * len(input_ids)
+    return ModelInput(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        labels=labels,
+        loss_weights=loss_weights,
+    )
+
+
+@RenderingPlugin("qwen3_nothink").register("parse_message")
+def parse_qwen3_nothink_message(generated_text: str) -> Message:
+    """Parse a message in the Qwen3 nothink template format. Supports interleaved reasoning and tool calls.
+
+    ``<thinking>...</thinking>`` spans become ``reasoning`` items, ``<tool_call>...</tool_call>`` spans become
+    ``tool_call`` items (their body must be valid JSON and is stored as the original string), and any
+    non-blank text around them becomes ``text`` items. Items keep their order of appearance.
+
+    Args:
+        generated_text (str): The generated text in the Qwen3 nothink template format.
+
+    Returns:
+        Message: The parsed message, with role ``"assistant"``.
+
+    Raises:
+        ValueError: If the body of a ``<tool_call>`` span is not valid JSON.
+    """
+    pattern = re.compile(r"<(thinking|tool_call)>\s*(.*?)\s*</\1>\s*", re.DOTALL)
+    content = []
+
+    def add_text(segment: str) -> None:
+        """Append ``segment`` as a text item unless it is blank."""
+        segment = segment.strip()
+        if segment:
+            content.append({"type": "text", "value": segment})
+
+    cursor = 0  # end of the last tagged span consumed so far
+    for match in pattern.finditer(generated_text):
+        add_text(generated_text[cursor : match.start()])
+
+        tag_type = match.group(1)
+        tag_value = match.group(2).strip()
+        if tag_type == "thinking":
+            content.append({"type": "reasoning", "value": tag_value})
+        elif tag_type == "tool_call":
+            try:
+                json.loads(tag_value)  # validation only; the raw string is what gets stored
+            except json.JSONDecodeError as err:
+                raise ValueError(f"Invalid tool call format: {tag_value}.") from err
+
+            content.append({"type": "tool_call", "value": tag_value})
+
+        cursor = match.end()
+
+    add_text(generated_text[cursor:])
+    return Message(role="assistant", content=content)
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/sampler_plugins/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/sampler_plugins/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/sampler_plugins/vllm.py b/LlamaFactory/src/llamafactory/v1/plugins/sampler_plugins/vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/batching.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..aef22eac2ef2b43bf7aa587c96f031000815cc94
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/batching.py
@@ -0,0 +1,34 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils.objects import StatefulBuffer
+from ...utils.plugin import BasePlugin
+from ...utils.types import BatchInfo, BatchInput, DataLoader
+
+
+class BatchingPlugin(BasePlugin):
+    """Batching hooks built on ``BasePlugin``; every method defined here raises ``NotImplementedError``."""
+
+    def compute_length(self, data_provider: DataLoader) -> int:
+        """Compute the length of the batch generator.
+
+        The approximate length is used to calculate the lr schedule.
+
+        Args:
+            data_provider (DataLoader): Data loader the length is computed for.
+
+        Returns:
+            int: The (approximate) number of batches.
+
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError()
+
+    def fill_buffer(self, buffer: StatefulBuffer, batch_info: BatchInfo) -> None:
+        """Fill the buffer with data.
+
+        Args:
+            buffer (StatefulBuffer): Buffer to fill.
+            batch_info (BatchInfo): Batch settings for this step.
+
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError()
+
+    def generate_batch(self, buffer: StatefulBuffer, batch_info: BatchInfo) -> list[BatchInput] | None:
+        """Generate a batch from the buffer.
+
+        Args:
+            buffer (StatefulBuffer): Buffer to draw samples from.
+            batch_info (BatchInfo): Batch settings for this step.
+
+        Returns:
+            list[BatchInput] | None: The generated batch; the signature also allows ``None``.
+
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError()
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/__init__.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/deepspeed.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/deepspeed.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/fsdp2.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/fsdp2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b40265ce36c28f2997abaf635de7b319b17c6c82
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/fsdp2.py
@@ -0,0 +1,399 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+
+import torch
+import torch.nn as nn
+from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict, set_model_state_dict
+from torch.distributed.fsdp import (
+    CPUOffloadPolicy,
+    MixedPrecisionPolicy,
+    fully_shard,
+)
+from transformers import PreTrainedModel
+
+from ....accelerator.helper import get_current_accelerator
+from ....accelerator.interface import DistributedInterface
+from ....utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def get_transformer_layer_cls(model: PreTrainedModel) -> type[nn.Module] | None:
+    """Find the class of the repeated transformer layer of ``model``.
+
+    Lookup order:
+        1. The class of the first sub-module whose class name is listed in ``model._no_split_modules``
+           (a list, tuple, set or frozenset of class names).
+        2. The class of ``model.model.layers[0]``.
+        3. The class of ``model.layers[0]``.
+
+    Args:
+        model (PreTrainedModel): Model to inspect.
+
+    Returns:
+        type[nn.Module] | None: The layer class, or ``None`` when none of the lookups succeeds.
+    """
+    no_split_modules = getattr(model, "_no_split_modules", None)
+    # Accept sets as well as lists/tuples; a bare string is deliberately not treated as a collection.
+    if no_split_modules and isinstance(no_split_modules, (list, tuple, set, frozenset)):
+        wanted_names = set(no_split_modules)
+        for module in model.modules():
+            if module.__class__.__name__ in wanted_names:
+                return module.__class__
+
+    # Fallbacks: ``model.model.layers`` first, then ``model.layers``. Empty containers are skipped.
+    for owner in (getattr(model, "model", None), model):
+        layers = getattr(owner, "layers", None)
+        if layers is not None and len(layers) > 0:
+            return type(layers[0])
+
+    return None
+
+
+class FSDP2Engine:
+    def __init__(self, dist_config: dict):
+        """Capture the distributed environment and the FSDP2 options.
+
+        Args:
+            dist_config (dict): Options read with defaults: ``mixed_precision`` ("bf16"),
+                ``reshard_after_forward`` (True), ``offload_params`` (False), ``pin_memory`` (True)
+                and ``dcp_path`` (None).
+        """
+        interface = DistributedInterface()
+        self.dist_interface = interface
+        self.rank = interface.get_rank()
+        self.local_rank = interface.get_local_rank()
+        self.world_size = interface.get_world_size()
+
+        read_option = dist_config.get
+        self.mixed_precision = read_option("mixed_precision", "bf16")
+        self.reshard_after_forward = read_option("reshard_after_forward", True)
+        self.offload_params = read_option("offload_params", False)
+        self.pin_memory = read_option("pin_memory", True)
+        self.dcp_path = read_option("dcp_path", None)
+
+        mesh = interface.data_device_mesh
+        self.device_mesh = mesh
+        if mesh is None:
+            logger.warning(
+                "Device Mesh not found in DistributedInterface. FSDP2 might fail if not running in distributed mode."
+            )
+            self.fsdp_mesh = None
+            return
+
+        # Prefer the "dp" sub-mesh; if it cannot be selected, use the mesh as it is.
+        try:
+            self.fsdp_mesh = mesh["dp"]
+        except Exception:
+            self.fsdp_mesh = mesh
+
+        logger.info(f"Using Device Mesh: {self.fsdp_mesh}")
+
+    def get_mp_policy(self) -> MixedPrecisionPolicy:
+        """Build the mixed-precision policy selected by ``self.mixed_precision``.
+
+        ``"bf16"`` and ``"fp16"`` select the matching parameter dtype; any other value selects ``float32``.
+        The reduce dtype is always ``float32`` and ``cast_forward_inputs`` is always enabled.
+
+        Returns:
+            MixedPrecisionPolicy: The policy passed to ``fully_shard``.
+        """
+        mode = self.mixed_precision
+        param_dtype = (
+            torch.bfloat16 if mode == "bf16" else torch.float16 if mode == "fp16" else torch.float32
+        )
+        return MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=torch.float32, cast_forward_inputs=True)
+
+    def prepare_model(self, model: PreTrainedModel) -> PreTrainedModel:
+        """Apply FSDP2 (``fully_shard``) to the transformer layers and then to the whole model.
+
+        Steps:
+            1. Shard every module whose type is the detected transformer layer class, plus every
+               ``nn.Embedding`` when ``model.config.tie_word_embeddings`` is false.
+            2. Enable non-reentrant gradient checkpointing and make the embedding outputs require grad.
+            3. Shard the root model.
+
+        Args:
+            model (PreTrainedModel): Model to shard.
+
+        Returns:
+            PreTrainedModel: The same model object; returned untouched when no FSDP mesh is available.
+        """
+        if self.fsdp_mesh is None:
+            logger.warning("No FSDP Mesh available, skipping FSDP wrapping.")
+            return model
+
+        # Arguments shared by every ``fully_shard`` call. ``offload_policy`` is only passed when CPU
+        # offloading is requested, so that ``fully_shard`` otherwise uses its own default policy.
+        shard_kwargs = {
+            "mesh": self.fsdp_mesh,
+            "reshard_after_forward": self.reshard_after_forward,
+            "mp_policy": self.get_mp_policy(),
+        }
+        if self.offload_params:
+            shard_kwargs["offload_policy"] = CPUOffloadPolicy(pin_memory=self.pin_memory)
+
+        layer_cls = get_transformer_layer_cls(model)
+        if layer_cls is None:
+            logger.warning(
+                "Could not identify Transformer Layer class, applying FSDP to the whole model structure only."
+            )
+        else:
+            logger.info(f"Applying per-layer FSDP to {layer_cls.__name__}")
+
+        # Snapshot the module list first, so the loop does not iterate a live traversal while sharding.
+        for module in list(model.modules()):
+            if layer_cls is not None and type(module) is layer_cls:
+                should_wrap = True
+            elif isinstance(module, nn.Embedding):
+                should_wrap = not getattr(model.config, "tie_word_embeddings", True)
+            else:
+                should_wrap = False
+
+            if should_wrap:
+                fully_shard(module, **shard_kwargs)
+
+        use_gradient_checkpointing = True  # Could be configurable
+        if use_gradient_checkpointing:
+            if self.rank == 0:
+                logger.info("Enabling gradient checkpointing (transformers native)...")
+
+            model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+        # The root is sharded last, after its sub-modules.
+        fully_shard(model, **shard_kwargs)
+
+        return model
+
+    @torch.no_grad()
+    def materialize_and_load(self, model: PreTrainedModel, hf_model_path: str, dcp_path: str = None):
+        """Allocate the model's storage on the current accelerator and fill it with checkpoint weights.
+
+        Args:
+            model (PreTrainedModel): Model to materialize via ``to_empty``.
+            hf_model_path (str): Hugging Face checkpoint location, used when no DCP checkpoint exists.
+            dcp_path (str, optional): Distributed checkpoint directory; preferred when it exists on disk.
+
+        Returns:
+            PreTrainedModel: The same model object, with weights loaded.
+        """
+        is_main = self.rank == 0
+        if is_main:
+            logger.info("Materializing sharded model params...")
+
+        model.to_empty(device=get_current_accelerator())
+
+        if dcp_path and os.path.exists(dcp_path):
+            if is_main:
+                logger.info(f"DCP path found at {dcp_path}. Using efficient Sharded Loading (DCP Load).")
+            self._load_from_dcp(model, dcp_path)
+            return model
+
+        # No usable DCP checkpoint: load from the Hugging Face checkpoint files instead.
+        if is_main:
+            if dcp_path:
+                logger.warning(f"DCP path {dcp_path} not found.")
+            logger.info("Using HF Meta Loading (Chunk Load).")
+        self._load_weights_from_hf_checkpoint(model, hf_model_path)
+        return model
+
+    def shard_model(self, model: PreTrainedModel) -> PreTrainedModel:
+        """Shard ``model`` and, if it lives on the meta device, materialize it and load its weights.
+
+        Args:
+            model (PreTrainedModel): Model to shard.
+
+        Returns:
+            PreTrainedModel: The sharded model.
+        """
+        # Decide before wrapping, while ``model`` is still the module that was passed in.
+        starts_on_meta = model.device.type == "meta"
+        model = self.prepare_model(model)
+        if not starts_on_meta:
+            return model
+
+        return self.materialize_and_load(model, hf_model_path=model.config.name_or_path, dcp_path=self.dcp_path)
+
+    def _load_from_dcp(self, model: PreTrainedModel, dcp_path: str):
+        """Load sharded weights into ``model`` from a distributed checkpoint (DCP) directory.
+
+        Args:
+            model (PreTrainedModel): Model to load into.
+            dcp_path (str): Checkpoint location passed to ``torch.distributed.checkpoint.load``.
+
+        Raises:
+            Exception: Any loading error is logged and then re-raised.
+        """
+        from torch.distributed.checkpoint import load as dcp_load
+
+        try:
+            is_main = self.rank == 0
+            if is_main:
+                logger.info(f"Loading distributed checkpoint from {dcp_path} ...")
+
+            # Non-full state dict with CPU offload: fetch it, fill it from the checkpoint, write it back.
+            sharded_options = StateDictOptions(full_state_dict=False, cpu_offload=True)
+            shard_state = get_model_state_dict(model, options=sharded_options)
+            dcp_load(state_dict=shard_state, checkpoint_id=dcp_path)
+            set_model_state_dict(model, shard_state, options=sharded_options)
+
+            if is_main:
+                logger.info("DCP weights loaded successfully.")
+
+        except Exception as e:
+            logger.error(f"Failed to load from DCP: {e}")
+            raise e
+
+    def _load_weights_from_hf_checkpoint(self, model, hf_model_path):
+        """Copy weights from Hugging Face checkpoint files into the parameters of ``model``.
+
+        Checkpoint files are discovered in this order: ``model.safetensors.index.json``,
+        ``model.safetensors``, ``pytorch_model.bin.index.json``, ``pytorch_model.bin``, any
+        ``*.safetensors`` file, any ``*.bin`` file. Files are processed one at a time, and only keys
+        that match a name in ``model.named_parameters()`` are copied.
+
+        Args:
+            model: Model whose parameters are filled in place.
+            hf_model_path: Local directory, file path or Hub repo id of the checkpoint.
+
+        Raises:
+            ValueError: If no checkpoint file is found.
+        """
+        import glob
+        import json
+
+        hf_model_path = self._resolve_hf_checkpoint_dir(hf_model_path)
+        is_main = self.rank == 0
+
+        if is_main:
+            logger.info(f"Loading weights from {hf_model_path} ...")
+
+        def in_checkpoint_dir(file_name: str) -> str:
+            return os.path.join(hf_model_path, file_name)
+
+        def read_index(index_file: str) -> list[str]:
+            """Return the sorted, de-duplicated shard paths listed in an index file."""
+            with open(index_file) as f:
+                weight_map = json.load(f)["weight_map"]
+
+            return [in_checkpoint_dir(name) for name in sorted(set(weight_map.values()))]
+
+        is_safetensors = True
+        if os.path.exists(in_checkpoint_dir("model.safetensors.index.json")):
+            checkpoint_files = read_index(in_checkpoint_dir("model.safetensors.index.json"))
+        elif os.path.exists(in_checkpoint_dir("model.safetensors")):
+            checkpoint_files = [in_checkpoint_dir("model.safetensors")]
+        elif os.path.exists(in_checkpoint_dir("pytorch_model.bin.index.json")):
+            is_safetensors = False
+            checkpoint_files = read_index(in_checkpoint_dir("pytorch_model.bin.index.json"))
+        elif os.path.exists(in_checkpoint_dir("pytorch_model.bin")):
+            is_safetensors = False
+            checkpoint_files = [in_checkpoint_dir("pytorch_model.bin")]
+        else:
+            # No canonical file name: fall back to whatever weight files are present.
+            checkpoint_files = sorted(glob.glob(in_checkpoint_dir("*.safetensors")))
+            if not checkpoint_files:
+                is_safetensors = False
+                checkpoint_files = sorted(glob.glob(in_checkpoint_dir("*.bin")))
+
+        if not checkpoint_files:
+            raise ValueError(f"No checkpoint files found in {hf_model_path}")
+
+        # NOTE(review): only parameters are loaded; buffers stored in the checkpoint are not -- confirm
+        # that no persistent buffer needs to be restored here.
+        param_map = dict(model.named_parameters())
+        missing_keys = set(param_map)
+        total_files = len(checkpoint_files)
+
+        for i, ckpt_file in enumerate(checkpoint_files):
+            if is_main:
+                logger.info(f"[{i + 1}/{total_files}] Loading {os.path.basename(ckpt_file)} ...")
+
+            if is_safetensors:
+                from safetensors import safe_open
+
+                with safe_open(ckpt_file, framework="pt", device="cpu") as f:
+                    for key in f.keys():
+                        if key in param_map:
+                            self._copy_weights(param_map[key], f.get_tensor(key))
+                            missing_keys.discard(key)
+            else:
+                # ``weights_only=True`` restricts unpickling to tensors and plain containers.
+                state_dict = torch.load(ckpt_file, map_location="cpu", weights_only=True)
+                for key, tensor in state_dict.items():
+                    if key in param_map:
+                        self._copy_weights(param_map[key], tensor)
+                        missing_keys.discard(key)
+
+                # Release the file's tensors before the next file is read.
+                del state_dict
+                gc.collect()
+
+        # Parameters with no matching checkpoint key were never written; make that visible.
+        if missing_keys and is_main:
+            logger.warning(
+                f"{len(missing_keys)} parameter(s) were not found in the checkpoint and were left "
+                f"uninitialized, e.g. {sorted(missing_keys)[:5]}"
+            )
+
+    def _resolve_hf_checkpoint_dir(self, hf_model_path: str) -> str:
+        """Resolve a HF model identifier or local path to a local directory containing checkpoint files.
+
+        - If `hf_model_path` is empty, return it unchanged.
+        - If `hf_model_path` is an existing directory, return it.
+        - If it's a file path, return its parent directory.
+        - Otherwise treat it as a Hugging Face Hub repo id and download/resolve to the local cache dir.
+
+        The revision is taken from ``HF_REVISION``; ``HF_HUB_OFFLINE=1`` or ``TRANSFORMERS_OFFLINE=1``
+        restricts the lookup to files that are already cached.
+
+        In a distributed run the download is staged: global rank 0 goes first, then the first rank of
+        every other node (which only resolves the cache when it is shared, and downloads when it is
+        node-local), and finally all remaining ranks read from the local cache only.
+
+        Args:
+            hf_model_path (str): Local directory, local file or Hub repo id.
+
+        Returns:
+            str: Local directory that holds the checkpoint files.
+
+        Raises:
+            ValueError: If the path is not local and ``huggingface_hub`` cannot be imported.
+        """
+        if not hf_model_path:
+            return hf_model_path
+
+        # Local directory or file path.
+        if os.path.isdir(hf_model_path):
+            return hf_model_path
+        if os.path.isfile(hf_model_path):
+            return os.path.dirname(hf_model_path)
+
+        # HuggingFace Hub repo id: snapshot to local cache so we can glob/index files.
+        try:
+            from huggingface_hub import snapshot_download
+        except ImportError as e:
+            raise ValueError(
+                f"hf_model_path='{hf_model_path}' does not exist locally and huggingface_hub is not available "
+                f"to download it. Please provide a local model directory or install huggingface_hub. Error: {e}"
+            ) from e
+
+        offline = os.getenv("HF_HUB_OFFLINE") == "1" or os.getenv("TRANSFORMERS_OFFLINE") == "1"
+        # Arguments shared by every ``snapshot_download`` call below.
+        download_kwargs = {
+            "repo_id": hf_model_path,
+            "revision": os.getenv("HF_REVISION"),
+            "allow_patterns": [
+                "*.safetensors",
+                "*.bin",
+                "*.index.json",
+                "model.safetensors",
+                "model.safetensors.index.json",
+                "pytorch_model.bin",
+                "pytorch_model.bin.index.json",
+                "config.json",
+            ],
+        }
+
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            is_global_main = self.rank == 0
+            is_node_main = self.local_rank == 0
+            local_dir = None
+
+            # Stage 1: a single rank downloads, to avoid N-way concurrent downloads.
+            if is_global_main:
+                local_dir = snapshot_download(local_files_only=offline, **download_kwargs)
+                logger.info(f"Resolved HF repo id '{hf_model_path}' to local dir: {local_dir}")
+            torch.distributed.barrier()
+
+            # Stage 2: one rank per remaining node makes sure that node's cache holds the snapshot.
+            if is_node_main and not is_global_main:
+                local_dir = snapshot_download(local_files_only=offline, **download_kwargs)
+            torch.distributed.barrier()
+
+            # Stage 3: everyone else only reads what is already cached.
+            if local_dir is None:
+                local_dir = snapshot_download(local_files_only=True, **download_kwargs)
+            return local_dir
+
+        local_dir = snapshot_download(local_files_only=offline, **download_kwargs)
+        if self.rank == 0:
+            logger.info(f"Resolved HF repo id '{hf_model_path}' to local dir: {local_dir}")
+        return local_dir
+
+    def _copy_weights(self, param, loaded_tensor):
+        """Copy a full checkpoint tensor into ``param``, keeping only this rank's shard for a ``DTensor``.
+
+        For a ``DTensor`` the first ``Shard`` placement decides the split: the full dimension is cut into
+        ceil-sized chunks, one per rank of that mesh dimension, and this rank's chunk is written to the
+        front of the local tensor. Ranks whose chunk would start past the end copy nothing.
+
+        Args:
+            param: Destination parameter (plain tensor or ``DTensor``).
+            loaded_tensor: Full, unsharded tensor read from the checkpoint. Dtype differences are
+                handled by ``copy_``, which converts to the destination dtype.
+        """
+        from torch.distributed.tensor import DTensor, Shard
+
+        if not isinstance(param, DTensor):
+            param.data.copy_(loaded_tensor)
+            return
+
+        # Only the first sharded mesh dimension is considered.
+        sharding = next(
+            ((i, placement) for i, placement in enumerate(param.placements) if isinstance(placement, Shard)),
+            None,
+        )
+        local_tensor = param.to_local()
+        if sharding is None:
+            # No ``Shard`` placement: every rank takes the full tensor.
+            local_tensor.copy_(loaded_tensor)
+            return
+
+        mesh_dim, shard_placement = sharding
+        mesh = param.device_mesh
+        my_coordinate = mesh.get_coordinate()
+        if my_coordinate is None:
+            return
+
+        dim = shard_placement.dim
+        full_size = param.shape[dim]
+        world_size_in_dim = mesh.size(mesh_dim)
+        chunk_size = (full_size + world_size_in_dim - 1) // world_size_in_dim
+
+        start = my_coordinate[mesh_dim] * chunk_size
+        if start >= full_size:
+            return
+
+        length = min(start + chunk_size, full_size) - start
+        # Slice first so that only this rank's shard is converted and copied.
+        local_tensor.narrow(dim, 0, length).copy_(loaded_tensor.narrow(dim, start, length))
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/hub.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/hub.py
new file mode 100644
index 0000000000000000000000000000000000000000..096cae14ee1ba0fac327416efcc95754595b5680
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/distributed/hub.py
@@ -0,0 +1,34 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....config.arg_utils import PluginConfig
+from ....utils.plugin import BasePlugin
+from ....utils.types import HFModel
+
+
+class DistributedPlugin(BasePlugin):
+    """Plugin family whose registered callables take a model and a distributed config."""
+
+    def __call__(self, model: HFModel, dist_config: PluginConfig, **kwargs) -> HFModel:
+        """Invoke the callable registered under this plugin's name.
+
+        Args:
+            model: Model handed to the registered callable.
+            dist_config: Distributed configuration handed to the registered callable.
+            **kwargs: Extra keyword arguments forwarded unchanged.
+
+        Returns:
+            Whatever the registered callable returns.
+        """
+        dispatch = super().__call__
+        return dispatch(model, dist_config, **kwargs)
+
+
+@DistributedPlugin("fsdp2").register()
+def shard_model_fsdp2(model: HFModel, dist_config: PluginConfig) -> HFModel:
+    """Shard ``model`` with an ``FSDP2Engine`` built from ``dist_config``.
+
+    Args:
+        model: Model to shard.
+        dist_config: Configuration passed to the ``FSDP2Engine`` constructor.
+
+    Returns:
+        The result of ``FSDP2Engine.shard_model``.
+    """
+    # Imported inside the function so the engine module is only loaded when this backend is used.
+    from .fsdp2 import FSDP2Engine
+
+    engine = FSDP2Engine(dist_config)
+    return engine.shard_model(model)
+
+
+@DistributedPlugin("deepspeed").register()
+def shard_model_deepspeed(model: HFModel, dist_config: PluginConfig) -> HFModel:
+    """Return ``model`` unchanged; ``dist_config`` is not used here.
+
+    NOTE(review): presumably DeepSpeed wrapping happens elsewhere -- confirm.
+    """
+    return model
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/lr_scheduler.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..02c9e8b034a20870ed3b2e67f15e2ac45fe3ef1f
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/lr_scheduler.py
@@ -0,0 +1,19 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils.plugin import BasePlugin
+
+
+class LRSchedulerPlugin(BasePlugin):
+    """Plugin type for learning-rate schedulers; adds nothing on top of ``BasePlugin``."""
+
+    pass
diff --git a/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/optimizer.py b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d040b0e29010118678befb74329fbb2de531a8c9
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/plugins/trainer_plugins/optimizer.py
@@ -0,0 +1,19 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils.plugin import BasePlugin
+
+
+class OptimizerPlugin(BasePlugin):
+    """Plugin type for optimizers; adds nothing on top of ``BasePlugin``."""
+
+    pass
diff --git a/LlamaFactory/src/llamafactory/v1/samplers/cli_sampler.py b/LlamaFactory/src/llamafactory/v1/samplers/cli_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..40647165f84c5137e4ede401e5a2e370efce0061
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/samplers/cli_sampler.py
@@ -0,0 +1,125 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from collections.abc import Generator
+from threading import Thread
+
+from ..config import InputArgument, ModelArguments, SampleArguments, SampleBackend, get_args
+from ..core.base_sampler import BaseSampler
+from ..core.data_engine import DataEngine
+from ..core.model_engine import ModelEngine
+from ..core.utils.rendering import Renderer
+from ..utils.types import HFModel, Message, Sample, TorchDataset
+
+
+class SyncSampler(BaseSampler):
+    """Blocking wrapper around the sampler's coroutine-based API.
+
+    A private event loop runs forever on a daemon thread; every public method submits
+    the parent's coroutine to that loop and waits for its result.
+    """
+
+    def __init__(
+        self,
+        args: SampleArguments,
+        model_args: ModelArguments,
+        model: HFModel,
+        renderer: Renderer,
+    ) -> None:
+        super().__init__(args, model_args, model, renderer)
+
+        def _serve(loop: asyncio.AbstractEventLoop) -> None:
+            # Runs on the background thread: bind the loop to that thread, then serve until stopped.
+            asyncio.set_event_loop(loop)
+            loop.run_forever()
+
+        background_loop = asyncio.new_event_loop()
+        self._loop = background_loop
+        # Daemon thread, so the never-ending loop does not keep the interpreter alive at exit.
+        self._thread = Thread(target=_serve, args=(background_loop,), daemon=True)
+        self._thread.start()
+
+    def generate(self, messages: list[Message], tools: str | None = None) -> Generator[str, None, None]:
+        """Generate tokens synchronously.
+
+        Args:
+            messages: List of messages.
+            tools: Tools string.
+
+        Yields:
+            Generated tokens.
+        """
+        stream = super().generate(messages, tools)
+        exhausted = False
+        while not exhausted:
+            try:
+                # Pull one item from the async generator on the background loop and block for it.
+                pending = asyncio.run_coroutine_threadsafe(stream.__anext__(), self._loop)
+                yield pending.result()
+            except StopAsyncIteration:
+                exhausted = True
+
+    def batch_infer(self, dataset: TorchDataset) -> list[Sample]:
+        """Batch infer samples synchronously.
+
+        Args:
+            dataset: Torch dataset.
+
+        Returns:
+            List of samples.
+        """
+        pending = asyncio.run_coroutine_threadsafe(super().batch_infer(dataset), self._loop)
+        return pending.result()
+
+
+def run_chat(args: InputArgument = None):
+    """Run batch inference on a dataset, or an interactive chat loop when none is given.
+
+    When ``data_args.train_dataset`` is set, the dataset is passed to
+    ``SyncSampler.batch_infer`` and the function ends. Otherwise the user is prompted
+    repeatedly: ``exit`` leaves the loop, ``clear`` drops the conversation history, and
+    any other input is sent to the model with the reply streamed to stdout.
+
+    Args:
+        args: Arguments forwarded to ``get_args``.
+    """
+    model_args, data_args, _, sample_args = get_args(args)
+    if sample_args.sample_backend != SampleBackend.HF:
+        model_args.init_plugin = {"name": "init_on_meta"}
+
+    model_engine = ModelEngine(model_args)
+    sampler = SyncSampler(sample_args, model_args, model_engine.model, model_engine.renderer)
+
+    if data_args.train_dataset is not None:
+        sampler.batch_infer(DataEngine(data_args.train_dataset))
+        return
+
+    if os.name != "nt":
+        try:
+            import readline  # noqa: F401
+        except ImportError:
+            print("Install `readline` for a better experience.")
+
+    history = []
+    print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
+
+    while True:
+        try:
+            query = input("\nUser: ")
+        except UnicodeDecodeError:
+            print("Detected decoding error at the inputs, please set the terminal encoding to utf-8.")
+            continue
+
+        command = query.strip()
+        if command == "exit":
+            break
+
+        if command == "clear":
+            history = []
+            print("History has been removed.")
+            continue
+
+        history.append({"role": "user", "content": [{"type": "text", "value": query}]})
+        print("Assistant: ", end="", flush=True)
+
+        # Echo each piece as it arrives and keep the pieces to rebuild the full reply.
+        pieces = []
+        for new_text in sampler.generate(history):
+            print(new_text, end="", flush=True)
+            pieces.append(new_text)
+
+        print()
+        history.append(model_engine.renderer.parse_message("".join(pieces)))
+
+
+# Running this module as a script starts the chat / batch-inference entry point.
+if __name__ == "__main__":
+    run_chat()
diff --git a/LlamaFactory/src/llamafactory/v1/trainers/__init__.py b/LlamaFactory/src/llamafactory/v1/trainers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/trainers/dpo_trainer.py b/LlamaFactory/src/llamafactory/v1/trainers/dpo_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/trainers/rm_trainer.py b/LlamaFactory/src/llamafactory/v1/trainers/rm_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/trainers/sft_trainer.py b/LlamaFactory/src/llamafactory/v1/trainers/sft_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..898b54f98aa81dc8aac34fcec1cad0d6a069eac2
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/trainers/sft_trainer.py
@@ -0,0 +1,52 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ..accelerator.interface import DistributedInterface
+from ..config import InputArgument, get_args
+from ..core.base_trainer import BaseTrainer
+from ..core.data_engine import DataEngine
+from ..core.model_engine import ModelEngine
+from ..utils.types import BatchInput, Tensor
+
+
+class SFTTrainer(BaseTrainer):
+    """Trainer whose loss is a loss-weighted average of negative log-probabilities."""
+
+    def compute_loss(self, batch: BatchInput) -> Tensor:
+        """Compute the weighted negative log-likelihood of ``batch``.
+
+        Args:
+            batch: Batch providing ``"loss_weights"`` plus whatever ``compute_log_probs`` reads.
+
+        Returns:
+            Sum of ``-log_prob * weight`` divided by the sum of the weights.
+        """
+        # Drop the first position along the last axis of the weights.
+        # NOTE(review): presumably this aligns them with next-token log-probs -- confirm.
+        weights = batch["loss_weights"].to(self.device, non_blocking=True)[..., 1:]
+        log_probs = self.compute_log_probs(self.model, batch)
+        weighted_nll = (-log_probs * weights).sum()
+        # The small constant keeps the division finite when every weight is zero.
+        normalizer = weights.sum() + 1e-6
+        return weighted_nll / normalizer
+
+
+def run_sft(args: InputArgument = None):
+    """Run supervised fine-tuning end to end: set up, fit, save the model, tear down.
+
+    Args:
+        args: Arguments forwarded to ``get_args``.
+    """
+    model_args, data_args, training_args, _ = get_args(args)
+    # Constructed for its side effect only; the instance is fetched again below via ``DistributedInterface()``.
+    DistributedInterface(training_args.dist_config)
+    train_dataset = DataEngine(data_args.train_dataset)
+    model_engine = ModelEngine(model_args)
+
+    trainer_kwargs = {
+        "args": training_args,
+        "model": model_engine.model,
+        "renderer": model_engine.renderer,
+        "train_dataset": train_dataset,
+    }
+    trainer = SFTTrainer(**trainer_kwargs)
+    trainer.fit()
+    trainer.save_model()
+
+    DistributedInterface().destroy()
+
+
+# Running this module as a script starts SFT; the string below shows an example command line.
+if __name__ == "__main__":
+    """
+    python -m llamafactory.v1.trainers.sft_trainer --model Qwen/Qwen3-0.6B --train_dataset data/v1_sft_demo.yaml
+    """
+    run_sft()
diff --git a/LlamaFactory/src/llamafactory/v1/utils/__init__.py b/LlamaFactory/src/llamafactory/v1/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/v1/utils/constants.py b/LlamaFactory/src/llamafactory/v1/utils/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ec68b44d399b2b507acd716f74eba72a342ee5e
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/constants.py
@@ -0,0 +1,15 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Sentinel label value: used as padding for label fields and excluded when counting valid tokens.
+IGNORE_INDEX = -100
diff --git a/LlamaFactory/src/llamafactory/v1/utils/dtype.py b/LlamaFactory/src/llamafactory/v1/utils/dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..331c9bddf2be56eae17ea414fc95b4f5de36edca
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/dtype.py
@@ -0,0 +1,91 @@
+# Copyright 2025 Bytedance Ltd. and the LlamaFactory team.
+#
+# This code is inspired by the Bytedance's verl library.
+# https://github.com/volcengine/verl/blob/v0.6.1/verl/utils/torch_dtypes.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from contextlib import contextmanager
+
+import torch
+from transformers.utils import is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
+
+from ..accelerator.interface import DistributedInterface
+
+
+class DtypeRegistry:
+    """Accepted spellings of each supported precision.
+
+    Every list mixes string aliases with the matching ``torch.dtype`` so that a
+    membership test works for either form.
+    """
+
+    # Spellings that resolve to ``torch.float16``.
+    HALF_LIST = ["fp16", "float16", "half", torch.float16]
+    # Spellings that resolve to ``torch.float32``.
+    FLOAT_LIST = ["fp32", "float32", "float", torch.float32]
+    # Spellings that resolve to ``torch.bfloat16``.
+    BFLOAT_LIST = ["bf16", "bfloat16", torch.bfloat16]
+
+
+class DtypeInterface:
+    """Helpers for recognising, converting and temporarily applying precisions."""
+
+    # Device support is probed while the class body executes, i.e. once at import time.
+    _is_fp16_available = is_torch_fp16_available_on_device(DistributedInterface().current_device)
+    _is_bf16_available = is_torch_bf16_available_on_device(DistributedInterface().current_device)
+    _is_fp32_available = True
+
+    @staticmethod
+    def is_available(precision: str | torch.dtype) -> bool:
+        """Report whether ``precision`` is usable on the current device.
+
+        Raises:
+            RuntimeError: If ``precision`` is not a known alias or dtype.
+        """
+        if DtypeInterface.is_fp16(precision):
+            return DtypeInterface._is_fp16_available
+        if DtypeInterface.is_fp32(precision):
+            return DtypeInterface._is_fp32_available
+        if DtypeInterface.is_bf16(precision):
+            return DtypeInterface._is_bf16_available
+
+        raise RuntimeError(f"Unexpected precision: {precision}")
+
+    @staticmethod
+    def is_fp16(precision: str | torch.dtype) -> bool:
+        """Return True when ``precision`` is one of the float16 spellings."""
+        return precision in DtypeRegistry.HALF_LIST
+
+    @staticmethod
+    def is_fp32(precision: str | torch.dtype) -> bool:
+        """Return True when ``precision`` is one of the float32 spellings."""
+        return precision in DtypeRegistry.FLOAT_LIST
+
+    @staticmethod
+    def is_bf16(precision: str | torch.dtype) -> bool:
+        """Return True when ``precision`` is one of the bfloat16 spellings."""
+        return precision in DtypeRegistry.BFLOAT_LIST
+
+    @staticmethod
+    def to_dtype(precision: str | torch.dtype) -> torch.dtype:
+        """Convert an alias or dtype to the canonical ``torch.dtype``.
+
+        Raises:
+            RuntimeError: If ``precision`` is not a known alias or dtype.
+        """
+        if DtypeInterface.is_fp16(precision):
+            return torch.float16
+        if DtypeInterface.is_fp32(precision):
+            return torch.float32
+        if DtypeInterface.is_bf16(precision):
+            return torch.bfloat16
+
+        raise RuntimeError(f"Unexpected precision: {precision}")
+
+    @staticmethod
+    def to_str(precision: torch.dtype) -> str:
+        """Convert a supported ``torch.dtype`` to its canonical string name.
+
+        Raises:
+            RuntimeError: If ``precision`` is not float16, float32 or bfloat16.
+        """
+        if precision == torch.float16:
+            return "float16"
+        if precision == torch.float32:
+            return "float32"
+        if precision == torch.bfloat16:
+            return "bfloat16"
+
+        raise RuntimeError(f"Unexpected precision: {precision}")
+
+    @contextmanager
+    def set_dtype(self, precision: str | torch.dtype):
+        """Context manager that switches torch's default dtype and restores it on exit."""
+        previous_default = torch.get_default_dtype()
+        requested = self.to_dtype(precision)
+        torch.set_default_dtype(requested)
+        try:
+            yield
+        finally:
+            # Restore even if the managed block raised.
+            torch.set_default_dtype(previous_default)
diff --git a/LlamaFactory/src/llamafactory/v1/utils/env.py b/LlamaFactory/src/llamafactory/v1/utils/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc17d7cc92f60149f48f9b74d693956f12be8218
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/env.py
@@ -0,0 +1,38 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import socket
+
+
+def find_available_port() -> int:
+    """Find an available port on the local machine.
+
+    Binds a TCP socket to port 0 so that the operating system picks a free port,
+    then reports the port that was assigned.
+
+    Returns:
+        The assigned port number. The socket is closed before returning, so another
+        process may take the port before the caller binds to it.
+    """
+    # The context manager closes the socket even when ``bind`` or ``getsockname`` raises.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.bind(("", 0))
+        return sock.getsockname()[1]
+
+
+def is_env_enabled(env_var: str, default: str = "0") -> bool:
+    """Tell whether an environment variable holds a truthy value.
+
+    Args:
+        env_var: Name of the environment variable.
+        default: Value used when the variable is not set.
+
+    Returns:
+        True if the value, compared case-insensitively, is one of
+        ``true``, ``yes``, ``on``, ``t``, ``y`` or ``1``.
+    """
+    truthy_values = ("true", "yes", "on", "t", "y", "1")
+    raw_value = os.getenv(env_var, default)
+    return raw_value.lower() in truthy_values
+
+
+def use_ray() -> bool:
+    """Always return False in this version."""
+    return False
+
+
+def use_kt() -> bool:
+    """Always return False in this version."""
+    return False
diff --git a/LlamaFactory/src/llamafactory/v1/utils/helper.py b/LlamaFactory/src/llamafactory/v1/utils/helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f7b755059dc135f23da5776223e69024ac3813b
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/helper.py
@@ -0,0 +1,93 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+from transformers import PreTrainedTokenizer
+
+from ..accelerator.interface import DistributedInterface
+from .constants import IGNORE_INDEX
+from .types import BatchInput, ModelInput, Processor, Tensor
+
+
+def is_tokenizer(processor: Processor) -> bool:
+    """Tell whether ``processor`` is itself a tokenizer.
+
+    An object is treated as a tokenizer exactly when it has no ``tokenizer`` attribute.
+
+    Args:
+        processor: Processor.
+
+    Returns:
+        Whether processor is tokenizer.
+    """
+    wraps_tokenizer = hasattr(processor, "tokenizer")
+    return not wraps_tokenizer
+
+
+def get_tokenizer(processor: Processor) -> PreTrainedTokenizer:
+    """Return the tokenizer behind ``processor``.
+
+    Args:
+        processor: Processor.
+
+    Returns:
+        ``processor.tokenizer`` when that attribute exists, otherwise ``processor`` itself.
+    """
+    return getattr(processor, "tokenizer", processor)
+
+
+def _pad_and_truncate(tensor: Tensor, max_seqlen: int, pad_value: int = 0) -> Tensor:
+    """Force the last dimension of ``tensor`` to length ``max_seqlen``.
+
+    Longer inputs are cut on the right; shorter inputs are right-padded with ``pad_value``.
+    """
+    current_len = tensor.shape[-1]
+    if current_len >= max_seqlen:
+        return tensor[..., :max_seqlen]
+
+    # ``new_full`` takes dtype and device from ``tensor``, so the filler matches the input.
+    filler_shape = (*tensor.shape[:-1], max_seqlen - current_len)
+    filler = tensor.new_full(filler_shape, pad_value)
+    return torch.cat((tensor, filler), dim=-1)
+
+
+def pad_and_truncate(samples: list[ModelInput], max_seqlen: int) -> list[BatchInput]:
+    """Convert every non-string field of each sample to a tensor of one shared length.
+
+    The shared length is the longest ``input_ids`` among ``samples``, capped at
+    ``max_seqlen``. Fields whose key contains ``"label"`` are padded with
+    ``IGNORE_INDEX``, all other fields with 0. String fields are copied through.
+
+    Args:
+        samples: Samples to pad or truncate.
+        max_seqlen: Upper bound on the shared length.
+
+    Returns:
+        One dict per sample with the same keys as the input.
+    """
+    target_len = min(max(len(sample["input_ids"]) for sample in samples), max_seqlen)
+
+    def _convert(key, value):
+        if isinstance(value, str):
+            return value
+
+        fill = IGNORE_INDEX if "label" in key else 0
+        return _pad_and_truncate(torch.tensor(value), target_len, fill)
+
+    return [{key: _convert(key, value) for key, value in sample.items()} for sample in samples]
+
+
+def compute_valid_tokens(batches: list[BatchInput]) -> int:
+    """Count label positions that are not ``IGNORE_INDEX`` across ``batches``.
+
+    Batches without a ``"labels"`` entry contribute nothing.
+
+    Args:
+        batches: Batches.
+
+    Returns:
+        Number of valid tokens.
+    """
+    device = DistributedInterface().current_device
+    total = 0
+    for batch in batches:
+        if "labels" not in batch:
+            continue
+
+        labels = batch["labels"].to(device, non_blocking=True)
+        total += (labels != IGNORE_INDEX).sum().item()
+
+    return total
diff --git a/LlamaFactory/src/llamafactory/v1/utils/logging.py b/LlamaFactory/src/llamafactory/v1/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d38927ff9edaa7797bd1e06143d29b5ba213a07
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/logging.py
@@ -0,0 +1,123 @@
+# Copyright 2025 Optuna, HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/utils/logging.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+import threading
+from functools import lru_cache
+from typing import Optional
+
+
+# Serialises the one-time configuration of the library root logger.
+_thread_lock = threading.RLock()
+# Handler installed by ``_configure_library_root_logger``; stays ``None`` until that has run.
+_default_handler: Optional["logging.Handler"] = None
+# Level used when the ``LLAMAFACTORY_VERBOSITY`` environment variable is unset or empty.
+_default_log_level: "logging._Level" = logging.INFO
+
+
+class _Logger(logging.Logger):
+    """A logger that supports rank0 logging.
+
+    The methods here forward to ``info`` / ``warning`` without any rank check.
+    NOTE(review): this module never instantiates the class, and functions with the same
+    names are assigned onto ``logging.Logger`` at the bottom of the module, so the class
+    appears to exist only as the annotated return type -- confirm.
+    """
+
+    def info_rank0(self, *args, **kwargs) -> None:
+        self.info(*args, **kwargs)
+
+    def warning_rank0(self, *args, **kwargs) -> None:
+        self.warning(*args, **kwargs)
+
+    def warning_rank0_once(self, *args, **kwargs) -> None:
+        self.warning(*args, **kwargs)
+
+
+def _get_default_logging_level() -> "logging._Level":
+    """Resolve the library log level from ``LLAMAFACTORY_VERBOSITY``.
+
+    Returns:
+        The level named by the variable (case-insensitive), or ``_default_log_level``
+        when the variable is unset or empty.
+
+    Raises:
+        ValueError: If the variable is set to a name ``logging`` does not know.
+    """
+    env_level_str = os.getenv("LLAMAFACTORY_VERBOSITY", None)
+    if not env_level_str:
+        return _default_log_level
+
+    level = logging._nameToLevel.get(env_level_str.upper())
+    if level is None:
+        raise ValueError(f"Unknown logging level: {env_level_str}.")
+
+    return level
+
+
+def _get_library_name() -> str:
+    """Return the first two dotted components of this module's ``__name__``."""
+    components = __name__.split(".")
+    return ".".join(components[:2])  # llamafactory.v1
+
+
+def _get_library_root_logger() -> "_Logger":
+    """Return the logger named after the library (see ``_get_library_name``)."""
+    return logging.getLogger(_get_library_name())
+
+
+def _configure_library_root_logger() -> None:
+    """Configure root logger using a stdout stream handler with an explicit format.
+
+    Runs at most once: later calls return as soon as they see the stored handler.
+    """
+    global _default_handler
+
+    # The check and the assignment happen under the lock so only one thread configures.
+    with _thread_lock:
+        if _default_handler:  # already configured
+            return
+
+        formatter = logging.Formatter(
+            fmt="[%(levelname)s|%(asctime)s] %(name)s:%(lineno)s >> %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+        _default_handler = logging.StreamHandler(sys.stdout)
+        _default_handler.setFormatter(formatter)
+        library_root_logger = _get_library_root_logger()
+        library_root_logger.addHandler(_default_handler)
+        library_root_logger.setLevel(_get_default_logging_level())
+        # Do not pass records on to ancestor loggers' handlers.
+        library_root_logger.propagate = False
+
+
+def get_logger(name: str | None = None) -> "_Logger":
+    """Return a logger with the specified name. It is not supposed to be accessed externally.
+
+    Args:
+        name: Logger name; the library name is used when omitted.
+
+    Returns:
+        The logger from ``logging.getLogger``, after the library root logger is configured.
+    """
+    logger_name = _get_library_name() if name is None else name
+    _configure_library_root_logger()
+    return logging.getLogger(logger_name)
+
+
+def add_handler(handler: "logging.Handler") -> None:
+    """Add a handler to the library root logger, configuring that logger first if needed."""
+    _configure_library_root_logger()
+    _get_library_root_logger().addHandler(handler)
+
+
+def remove_handler(handler: logging.Handler) -> None:
+    """Remove a handler from the library root logger, configuring that logger first if needed."""
+    _configure_library_root_logger()
+    _get_library_root_logger().removeHandler(handler)
+
+
+def info_rank0(self: "logging.Logger", *args, **kwargs) -> None:
+    """Log at INFO level only when ``LOCAL_RANK`` is 0 (or unset)."""
+    local_rank = int(os.getenv("LOCAL_RANK", "0"))
+    if local_rank != 0:
+        return
+
+    self.info(*args, **kwargs)
+
+
+def warning_rank0(self: "logging.Logger", *args, **kwargs) -> None:
+    """Log at WARNING level only when ``LOCAL_RANK`` is 0 (or unset)."""
+    local_rank = int(os.getenv("LOCAL_RANK", "0"))
+    if local_rank != 0:
+        return
+
+    self.warning(*args, **kwargs)
+
+
+@lru_cache(None)
+def warning_rank0_once(self: "logging.Logger", *args, **kwargs) -> None:
+    """Log at WARNING level on local rank 0, at most once per distinct call.
+
+    ``lru_cache`` memoises on the logger plus all arguments, so a repeated identical
+    call does nothing. Every argument must therefore be hashable. A call made while
+    ``LOCAL_RANK`` is non-zero is cached as well.
+    """
+    local_rank = int(os.getenv("LOCAL_RANK", "0"))
+    if local_rank != 0:
+        return
+
+    self.warning(*args, **kwargs)
+
+
+# Attach the rank-aware helpers to the standard Logger class so every logger instance has them.
+logging.Logger.info_rank0 = info_rank0
+logging.Logger.warning_rank0 = warning_rank0
+logging.Logger.warning_rank0_once = warning_rank0_once
diff --git a/LlamaFactory/src/llamafactory/v1/utils/objects.py b/LlamaFactory/src/llamafactory/v1/utils/objects.py
new file mode 100644
index 0000000000000000000000000000000000000000..338f52365533ce312468f1d3195f4c808cf99ad9
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/objects.py
@@ -0,0 +1,67 @@
+# Copyright 2025 Optuna, HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/utils/logging.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .types import ModelInput
+
+
+class StatefulBuffer:
+    """A FIFO buffer of model inputs that tracks its total token count.
+
+    The token count of a sample is ``len(sample["input_ids"])``.
+    """
+
+    def __init__(self, max_buffer_size: int = 1_000_000_000) -> None:
+        """Create an empty buffer.
+
+        Args:
+            max_buffer_size: Maximum total number of tokens the buffer may hold.
+        """
+        self._buffer: list[ModelInput] = []
+        self._buffer_size: int = 0
+        self._max_buffer_size: int = max_buffer_size
+
+    def __len__(self) -> int:
+        """Return the number of buffered samples (not tokens)."""
+        return len(self._buffer)
+
+    @property
+    def size(self) -> int:
+        """Total number of tokens currently buffered."""
+        return self._buffer_size
+
+    def put(self, samples: list[ModelInput]) -> None:
+        """Add samples to the buffer.
+
+        Raises:
+            ValueError: If adding ``samples`` would exceed the maximum token count;
+                the buffer is left unchanged in that case.
+        """
+        num_tokens = sum(len(sample["input_ids"]) for sample in samples)
+        if self._buffer_size + num_tokens > self._max_buffer_size:
+            raise ValueError(f"Buffer size exceeds max buffer size {self._max_buffer_size}.")
+
+        self._buffer.extend(samples)
+        self._buffer_size += num_tokens
+
+    def get(self, value: int) -> list[ModelInput]:
+        """Get samples from the buffer and remove them.
+
+        Args:
+            value: Number of samples to take from the front of the buffer.
+
+        Returns:
+            Up to ``value`` samples, oldest first.
+        """
+        samples = self._buffer[:value]
+        self._buffer_size -= sum(len(sample["input_ids"]) for sample in samples)
+        del self._buffer[:value]
+        return samples
+
+    def clear(self) -> None:
+        """Clear the buffer."""
+        self._buffer = []
+        self._buffer_size = 0
+
+    def state_dict(self) -> dict:
+        """Returns the state of the buffer.
+
+        The sample list is copied, so later ``put``/``get`` calls cannot change a
+        snapshot that was already taken and make it disagree with its ``buffer_size``.
+        """
+        return {
+            "buffer": list(self._buffer),
+            "buffer_size": self._buffer_size,
+        }
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        """Loads the state into the buffer.
+
+        The sample list is copied so that the buffer does not mutate the caller's list.
+        """
+        self._buffer = list(state_dict["buffer"])
+        self._buffer_size = state_dict["buffer_size"]
diff --git a/LlamaFactory/src/llamafactory/v1/utils/packages.py b/LlamaFactory/src/llamafactory/v1/utils/packages.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d86e01a8b7fec044d58b84daf3cc54f060a6870
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/packages.py
@@ -0,0 +1,43 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/import_utils.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.metadata
+import importlib.util
+from functools import lru_cache
+from typing import TYPE_CHECKING
+
+from packaging import version
+
+
+if TYPE_CHECKING:
+    from packaging.version import Version
+
+
+def _is_package_available(name: str) -> bool:
+    """Return True when the import system can locate a module called ``name``."""
+    spec = importlib.util.find_spec(name)
+    return spec is not None
+
+
+def _get_package_version(name: str) -> "Version":
+    """Return the installed version of ``name``, or ``0.0.0`` when it cannot be determined."""
+    try:
+        raw_version = importlib.metadata.version(name)
+        return version.parse(raw_version)
+    except Exception:
+        # Any failure (missing distribution, unparsable version) maps to the fallback version.
+        return version.parse("0.0.0")
+
+
+@lru_cache
+def is_transformers_version_greater_than(content: str):
+    """Return True when the installed ``transformers`` version is at least ``content``.
+
+    Despite the name, the comparison is inclusive (``>=``). Results are memoised per
+    ``content`` string by ``lru_cache``.
+    """
+    return _get_package_version("transformers") >= version.parse(content)
diff --git a/LlamaFactory/src/llamafactory/v1/utils/plugin.py b/LlamaFactory/src/llamafactory/v1/utils/plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c06c88d33519b38f8ea6b9cb6e5f38ccb29d420
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/plugin.py
@@ -0,0 +1,107 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from collections import defaultdict
+from collections.abc import Callable
+from typing import Any
+
+from . import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class _UnregisteredMethodError(ValueError, AttributeError):
+    """Raised when a plugin method is looked up but has not been registered.
+
+    It is a ``ValueError`` so existing ``except ValueError`` handlers keep working, and an
+    ``AttributeError`` so ``hasattr``, ``getattr`` with a default and the ``copy`` module
+    treat an unregistered method as a missing attribute.
+    """
+
+
+class BasePlugin:
+    """Base class for plugins.
+
+    A plugin is a callable object that can be registered and called by name.
+    The registry is a single class attribute defined here, so it is shared by every subclass.
+
+    Example usage:
+    ```python
+    class PrintPlugin(BasePlugin):
+        def again(self):  # optional
+            self["again"]()
+
+
+    @PrintPlugin("hello").register()
+    def print_hello():
+        print("Hello world!")
+
+
+    @PrintPlugin("hello").register("again")
+    def print_hello_again():
+        print("Hello world! Again.")
+
+
+    PrintPlugin("hello")()
+    PrintPlugin("hello").again()
+    ```
+    """
+
+    # plugin name -> method name -> registered callable
+    _registry: dict[str, dict[str, Callable]] = defaultdict(dict)
+
+    def __init__(self, name: str | None = None) -> None:
+        """Initialize the plugin with a name."""
+        self.name = name
+
+    def register(self, method_name: str = "__call__") -> Callable:
+        """Decorator to register a function as a plugin.
+
+        Args:
+            method_name: Name to register the function under; ``"__call__"`` makes it
+                the function run when the plugin instance is called.
+
+        Returns:
+            A decorator that stores the function in the registry and returns it unchanged.
+
+        Raises:
+            ValueError: If the plugin was created without a name.
+        """
+        if self.name is None:
+            raise ValueError("Plugin name should be specified.")
+
+        # A duplicate only triggers a warning; the later registration replaces the earlier one.
+        if method_name in self._registry[self.name]:
+            logger.warning_rank0_once(f"Method {method_name} of plugin {self.name} is already registered.")
+
+        def decorator(func: Callable) -> Callable:
+            self._registry[self.name][method_name] = func
+            return func
+
+        return decorator
+
+    def __call__(self, *args, **kwargs) -> Any:
+        """Call the registered function with the given arguments."""
+        return self["__call__"](*args, **kwargs)
+
+    def __getattr__(self, method_name: str) -> Callable:
+        """Get the registered function with the given name.
+
+        Only reached when normal attribute lookup fails.
+
+        Raises:
+            AttributeError: If no such method is registered. The error is also a
+                ``ValueError``.
+        """
+        if method_name == "name":
+            # ``name`` is only missing when ``__init__`` has not run, e.g. on the bare
+            # instance ``copy`` creates. Resolving it through the registry would read
+            # ``self.name`` again and recurse without end.
+            raise AttributeError(method_name)
+
+        return self[method_name]
+
+    def __getitem__(self, method_name: str) -> Callable:
+        """Get the registered function with the given name.
+
+        Raises:
+            ValueError: If no such method is registered for this plugin name.
+        """
+        # ``.get`` avoids creating an empty registry entry as a side effect of a lookup.
+        methods = self._registry.get(self.name, {})
+        if method_name not in methods:
+            raise _UnregisteredMethodError(f"Method {method_name} of plugin {self.name} is not registered.")
+
+        return methods[method_name]
+
+
+# Small demo of the registry, runnable with the command shown in the string below.
+if __name__ == "__main__":
+    """
+    python -m llamafactory.v1.utils.plugin
+    """
+
+    class PrintPlugin(BasePlugin):
+        def again(self):  # optional
+            self["again"]()
+
+    # Registered under the default method name, so it runs when the plugin is called.
+    @PrintPlugin("hello").register()
+    def print_hello():
+        print("Hello world!")
+
+    # Registered under "again", which ``PrintPlugin.again`` looks up.
+    @PrintPlugin("hello").register("again")
+    def print_hello_again():
+        print("Hello world! Again.")
+
+    PrintPlugin("hello")()
+    PrintPlugin("hello").again()
diff --git a/LlamaFactory/src/llamafactory/v1/utils/pytest.py b/LlamaFactory/src/llamafactory/v1/utils/pytest.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbbaa08cf0aaf8180d9fe02f56d236e40d64b3bd
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/pytest.py
@@ -0,0 +1,35 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from contextlib import contextmanager
+
+
+@contextmanager
+def dist_env(local_rank: int = 0, world_size: int = 1, master_port: int = 25595):
+    """Temporarily set the distributed environment variables.
+
+    Inside the ``with`` block ``MASTER_ADDR``, ``MASTER_PORT``, ``RANK``, ``LOCAL_RANK``,
+    ``WORLD_SIZE`` and ``LOCAL_WORLD_SIZE`` are set in ``os.environ``. On exit, including
+    when the block raises, every variable is put back to the value it had before entering;
+    variables that were not set before are removed.
+
+    Args:
+        local_rank: Value written to both ``RANK`` and ``LOCAL_RANK``.
+        world_size: Value written to both ``WORLD_SIZE`` and ``LOCAL_WORLD_SIZE``.
+        master_port: Value written to ``MASTER_PORT``.
+    """
+    env_vars = {
+        "MASTER_ADDR": "127.0.0.1",
+        "MASTER_PORT": str(master_port),
+        "RANK": str(local_rank),
+        "LOCAL_RANK": str(local_rank),
+        "WORLD_SIZE": str(world_size),
+        "LOCAL_WORLD_SIZE": str(world_size),
+    }
+    # Snapshot the previous values first: blindly popping the keys on exit would delete
+    # settings that already existed (e.g. when running under a real launcher).
+    previous = {key: os.environ.get(key) for key in env_vars}
+    os.environ.update(env_vars)
+    try:
+        yield
+    finally:
+        for key, old_value in previous.items():
+            if old_value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = old_value
diff --git a/LlamaFactory/src/llamafactory/v1/utils/types.py b/LlamaFactory/src/llamafactory/v1/utils/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f39069686f8bb55142c7df176afff08eba2e04c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/v1/utils/types.py
@@ -0,0 +1,180 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Iterator
+from typing import TYPE_CHECKING, Any, Literal, NamedTuple, NotRequired, TypedDict, Union
+
+
+# The third-party packages below are imported only while static type checking, so importing
+# this module at runtime does not import them from here.
+if TYPE_CHECKING:
+    import datasets
+    import numpy as np
+    import torch
+    import torch.utils.data
+    import transformers
+    from torch.distributed import ProcessGroup
+    from torch.distributed.fsdp import FullyShardedDataParallel
+
+    # Short aliases used in annotations.
+    Tensor = torch.Tensor
+    TensorLike = Union[int, float, list[int], list[float], np.ndarray, Tensor]
+    TorchDataset = Union[torch.utils.data.Dataset, torch.utils.data.IterableDataset]
+    HFDataset = Union[datasets.Dataset, datasets.IterableDataset]
+    DataCollator = transformers.DataCollator
+    DataLoader = torch.utils.data.DataLoader
+    HFConfig = transformers.PretrainedConfig
+    HFModel = transformers.PreTrainedModel
+    DistModel = Union[torch.nn.parallel.DistributedDataParallel, FullyShardedDataParallel]
+    Processor = Union[transformers.PreTrainedTokenizer, transformers.ProcessorMixin]
+    Optimizer = torch.optim.Optimizer
+    Scheduler = torch.optim.lr_scheduler.LRScheduler
+    # Self-assignment of the imported name; presumably kept so this list mirrors the else-branch.
+    ProcessGroup = ProcessGroup
+else:
+    # Runtime placeholders: every alias is bound to None so the names stay importable.
+    # Annotations in this module that use them (e.g. ``Tensor``) therefore evaluate to None at runtime.
+    Tensor = None
+    TensorLike = None
+    TorchDataset = None
+    HFDataset = None
+    DataCollator = None
+    DataLoader = None
+    HFConfig = None
+    HFModel = None
+    DistModel = None
+    Processor = None
+    Optimizer = None
+    Scheduler = None
+    ProcessGroup = None
+
+
+class DatasetInfo(TypedDict, total=False):
+    """Description of one dataset entry: where it comes from and how it is sampled.
+
+    NOTE(review): the class is declared with ``total=False``, so type checkers treat every
+    key, including ``path``, as optional and the ``NotRequired`` markers are redundant.
+    Confirm whether ``path`` is meant to be mandatory.
+    """
+
+    path: str
+    """Local file path."""
+    source: NotRequired[Literal["hf_hub", "ms_hub", "local"]]
+    """Dataset source, default to "hf_hub"."""
+    split: NotRequired[str]
+    """Dataset split, default to "train"."""
+    converter: NotRequired[str]
+    """Dataset converter, default to None."""
+    size: NotRequired[int]
+    """Number of samples, default to all samples."""
+    weight: NotRequired[float]
+    """Dataset weight, default to 1.0."""
+    streaming: NotRequired[bool]
+    """Is streaming dataset, default to False."""
+
+
+class DistributedConfig(TypedDict, total=False):
+    """Parallelism sizes and communication timeout for distributed runs.
+
+    All keys are optional (``total=False``); the defaults quoted below are the ones stated
+    by the field descriptions, not values enforced by this type.
+    """
+
+    mp_replicate_size: NotRequired[int]
+    """Model parallel replicate size, default to 1."""
+    mp_shard_size: NotRequired[int]
+    """Model parallel shard size, default to world_size // mp_replicate_size."""
+    dp_size: NotRequired[int]
+    """Data parallel size, default to world_size // cp_size."""
+    cp_size: NotRequired[int]
+    """Context parallel size, default to 1."""
+    timeout: NotRequired[int]
+    """Timeout for distributed communication, default to 600."""
+
+
+class Content(TypedDict):
+    """One typed piece of a message: a ``type`` tag plus its string ``value``."""
+
+    type: Literal["text", "reasoning", "tool_call", "image_url"]
+    """Type of the content."""
+    value: str
+    """Value of the content."""
+
+
+class Message(TypedDict):
+    """A single conversation turn: a role, a list of ``Content`` parts and an optional loss weight."""
+
+    role: Literal["system", "user", "assistant", "tool"]
+    """Role of the message."""
+    content: list[Content]
+    """Content of the message."""
+    loss_weight: NotRequired[float]
+    """Loss weight for this message, default to 1.0. Required in training."""
+
+
+class SFTSample(TypedDict):
+    """A sample made of one list of messages, plus optional tools and metadata."""
+
+    messages: list[Message]
+    """Messages in the sample."""
+    tools: NotRequired[str]
+    """Tools for the sample in JSON string format."""
+    extra_info: NotRequired[str]
+    """Extra information for the sample, e.g. kto_labels."""
+    _dataset_name: NotRequired[str]
+    """Dataset name for the sample."""
+
+
+class DPOSample(TypedDict):
+    """A preference sample holding a chosen and a rejected list of messages, plus optional metadata."""
+
+    chosen_messages: list[Message]
+    """Chosen messages in the sample."""
+    rejected_messages: list[Message]
+    """Rejected messages in the sample."""
+    tools: NotRequired[str]
+    """Tools for the sample in JSON string format."""
+    extra_info: NotRequired[str]
+    """Extra information for the sample, e.g. kto_labels."""
+    _dataset_name: NotRequired[str]
+    """Dataset name for the sample."""
+
+
+# Any sample type defined above: either a single-conversation or a chosen/rejected sample.
+Sample = Union[SFTSample, DPOSample]
+
+
+class ToolCall(TypedDict):
+    """A function call: the function name and its arguments as a mapping."""
+
+    name: str
+    """Function name."""
+    arguments: dict[str, Any]
+    """Function arguments."""
+
+
+class ModelInput(TypedDict, total=False):
+    """Per-sample model features stored as plain Python lists.
+
+    Declared with ``total=False``, so every key is optional for type checkers.
+    """
+
+    input_ids: list[int]
+    """Input ids for the model."""
+    attention_mask: list[int]
+    """Attention mask for the model."""
+    labels: list[int]
+    """Labels for the model."""
+    loss_weights: list[float]
+    """Loss weight for each token, default to 1.0."""
+    position_ids: NotRequired[list[int] | list[list[int]]]
+    """Position ids for the model (optional)."""
+    token_type_ids: NotRequired[list[int]]
+    """Token type ids used in DPO, 1 represents the chosen messages, 2 represents the rejected messages."""
+
+
+class BatchInput(TypedDict, total=False):
+    """Tensor counterpart of ``ModelInput``: the same keys, each holding a ``Tensor``.
+
+    Declared with ``total=False``, so every key is optional for type checkers.
+    """
+
+    input_ids: Tensor
+    """Input ids for the model."""
+    attention_mask: Tensor
+    """Attention mask for the model."""
+    labels: Tensor
+    """Labels for the model."""
+    loss_weights: Tensor
+    """Loss weight for each token, default to 1.0."""
+    position_ids: NotRequired[Tensor]
+    """Position ids for the model (optional)."""
+    token_type_ids: NotRequired[Tensor]
+    """Token type ids used in DPO, 1 represents the chosen messages, 2 represents the rejected messages."""
+
+
+class BatchInfo(TypedDict):
+    """Batching parameters together with the iterator that yields lists of ``ModelInput``."""
+
+    micro_batch_size: int
+    """Micro batch size."""
+    num_micro_batch: int
+    """Number of micro batches."""
+    cutoff_len: int
+    """Cutoff length."""
+    data_iter: Iterator[list[ModelInput]]
+    """Data iterator."""
+
+
+class ModelOutput(NamedTuple):
+    """Named tuple wrapping the model output; it currently has the single field ``logits``."""
+
+    logits: Tensor
+    """Logits for the model."""
diff --git a/LlamaFactory/src/llamafactory/webui/__init__.py b/LlamaFactory/src/llamafactory/webui/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/LlamaFactory/src/llamafactory/webui/chatter.py b/LlamaFactory/src/llamafactory/webui/chatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e86505b4b2e36cd28a27f65297778259dc67adc5
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/chatter.py
@@ -0,0 +1,246 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections.abc import Generator
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Any
+
+from transformers.utils import is_torch_npu_available
+
+from ..chat import ChatModel
+from ..data import Role
+from ..extras.constants import PEFT_METHODS
+from ..extras.misc import torch_gc
+from ..extras.packages import is_gradio_available
+from .common import get_save_dir, load_config
+from .locales import ALERTS
+
+
+if TYPE_CHECKING:
+    from ..chat import BaseEngine
+    from .manager import Manager
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+def _escape_html(text: str) -> str:
+    r"""Replace the angle brackets ``<`` and ``>`` with their HTML entities.
+
+    Only these two characters are rewritten; ``&`` and quotes are left untouched.
+    """
+    for raw, entity in (("<", "&lt;"), (">", "&gt;")):
+        text = text.replace(raw, entity)
+
+    return text
+
+
+def _format_response(text: str, lang: str, escape_html: bool, thought_words: tuple[str, str]) -> str:
+    r"""Render a response, folding its thought section into a ``<details>`` element.
+
+    When the opening thought word is absent the text is returned as is (escaped if requested).
+    Otherwise every opening word is removed and the text is split once at the closing word:
+    the part before it is the thought, the part after it is the answer. If the closing word
+    has not appeared, the whole text is treated as a thought still in progress.
+
+    Based on: https://huggingface.co/spaces/Lyte/DeepSeek-R1-Distill-Qwen-1.5B-Demo-GGUF/blob/main/app.py
+    """
+    opening_word = thought_words[0]
+    if opening_word not in text:
+        if escape_html:
+            return _escape_html(text)
+
+        return text
+
+    text = text.replace(opening_word, "")
+    thought, closing_found, answer = text.partition(thought_words[1])
+    # The summary label tells whether the thought is finished or still being generated.
+    summary_key = "info_thought" if closing_found else "info_thinking"
+    summary = ALERTS[summary_key][lang]
+
+    if escape_html:
+        thought = _escape_html(thought)
+        answer = _escape_html(answer)
+
+    details_block = (
+        f"<details open><summary class='thinking-summary'><span>{summary}</span></summary>\n\n"
+        f"<div class='thinking-container'>\n{thought}\n</div>\n</details>{answer}"
+    )
+    return details_block
+
+
+@contextmanager
+def update_attr(obj: Any, name: str, value: Any):
+    r"""Temporarily set ``obj.<name>`` to ``value`` for the duration of the ``with`` block.
+
+    The previous value is put back when the block exits, including when the block raises
+    or when a generator suspended inside the block is closed. If the attribute did not
+    exist beforehand it is left set to ``None`` afterwards.
+
+    Args:
+        obj: Object whose attribute is overridden.
+        name: Attribute name.
+        value: Temporary value.
+    """
+    old_value = getattr(obj, name, None)
+    setattr(obj, name, value)
+    try:
+        yield
+    finally:
+        # Without ``finally`` an exception (or GeneratorExit) thrown at the yield would
+        # skip the restore and leave the temporary value in place permanently.
+        setattr(obj, name, old_value)
+
+
+class WebChatModel(ChatModel):
+    r"""Chat model controlled from the web UI.
+
+    ``self.engine`` starts as ``None`` and ``loaded`` reports whether it has been set.
+    Loading is done by calling the parent initializer with arguments collected from the UI
+    (presumably the parent sets ``self.engine`` -- confirm in ``ChatModel``).
+    """
+
+    def __init__(self, manager: "Manager", demo_mode: bool = False, lazy_init: bool = True) -> None:
+        r"""Store the UI manager and optionally initialize the parent chat model right away.
+
+        Args:
+            manager: UI manager used to look up components by element id.
+            demo_mode: When true, loading and unloading from the UI are refused; if the
+                ``DEMO_MODEL`` and ``DEMO_TEMPLATE`` environment variables are set, that model
+                is loaded here (``DEMO_BACKEND`` selects the backend, default "huggingface").
+            lazy_init: When false, the parent initializer is called with no arguments.
+        """
+        self.manager = manager
+        self.demo_mode = demo_mode
+        self.engine: BaseEngine | None = None
+
+        if not lazy_init:  # read arguments from command line
+            super().__init__()
+
+        if demo_mode and os.getenv("DEMO_MODEL") and os.getenv("DEMO_TEMPLATE"):  # load demo model
+            model_name_or_path = os.getenv("DEMO_MODEL")
+            template = os.getenv("DEMO_TEMPLATE")
+            infer_backend = os.getenv("DEMO_BACKEND", "huggingface")
+            super().__init__(
+                dict(model_name_or_path=model_name_or_path, template=template, infer_backend=infer_backend)
+            )
+
+    @property
+    def loaded(self) -> bool:
+        r"""Whether an engine is currently set."""
+        return self.engine is not None
+
+    def load_model(self, data) -> Generator[str, None, None]:
+        r"""Load a model from the values of the UI components, yielding status messages.
+
+        ``data`` is indexed with the components returned by ``manager.get_elem_by_id``.
+        On a validation error a warning is shown, the error text is yielded and nothing is loaded.
+        """
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
+        lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path")
+        finetuning_type, checkpoint_path = get("top.finetuning_type"), get("top.checkpoint_path")
+        user_config = load_config()
+
+        # Only the first matching problem in this chain is reported.
+        error = ""
+        if self.loaded:
+            error = ALERTS["err_exists"][lang]
+        elif not model_name:
+            error = ALERTS["err_no_model"][lang]
+        elif not model_path:
+            error = ALERTS["err_no_path"][lang]
+        elif self.demo_mode:
+            error = ALERTS["err_demo"][lang]
+
+        # Invalid JSON in the extra arguments overrides any error selected above.
+        try:
+            json.loads(get("infer.extra_args"))
+        except json.JSONDecodeError:
+            error = ALERTS["err_json_schema"][lang]
+
+        if error:
+            gr.Warning(error)
+            yield error
+            return
+
+        yield ALERTS["info_loading"][lang]
+        args = dict(
+            model_name_or_path=model_path,
+            cache_dir=user_config.get("cache_dir", None),
+            finetuning_type=finetuning_type,
+            template=get("top.template"),
+            rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") != "none" else None,
+            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
+            use_unsloth=(get("top.booster") == "unsloth"),
+            enable_liger_kernel=(get("top.booster") == "liger_kernel"),
+            infer_backend=get("infer.infer_backend"),
+            infer_dtype=get("infer.infer_dtype"),
+            trust_remote_code=True,
+        )
+        # User-supplied extra arguments take precedence over the values built above.
+        args.update(json.loads(get("infer.extra_args")))
+
+        # checkpoints
+        if checkpoint_path:
+            if finetuning_type in PEFT_METHODS:  # list
+                args["adapter_name_or_path"] = ",".join(
+                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path]
+                )
+            else:  # str
+                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path)
+
+        # quantization
+        if get("top.quantization_bit") != "none":
+            args["quantization_bit"] = int(get("top.quantization_bit"))
+            args["quantization_method"] = get("top.quantization_method")
+            # Double quantization is switched off when an NPU is available.
+            args["double_quantization"] = not is_torch_npu_available()
+
+        super().__init__(args)
+        yield ALERTS["info_loaded"][lang]
+
+    def unload_model(self, data) -> Generator[str, None, None]:
+        r"""Drop the current engine and free memory, yielding status messages.
+
+        In demo mode nothing is unloaded: a warning is shown and the error text is yielded.
+        """
+        lang = data[self.manager.get_elem_by_id("top.lang")]
+
+        if self.demo_mode:
+            gr.Warning(ALERTS["err_demo"][lang])
+            yield ALERTS["err_demo"][lang]
+            return
+
+        yield ALERTS["info_unloading"][lang]
+        self.engine = None
+        torch_gc()
+        yield ALERTS["info_unloaded"][lang]
+
+    @staticmethod
+    def append(
+        chatbot: list[dict[str, str]],
+        messages: list[dict[str, str]],
+        role: str,
+        query: str,
+        escape_html: bool,
+    ) -> tuple[list[dict[str, str]], list[dict[str, str]], str]:
+        r"""Add the user input to chatbot.
+
+        New lists are returned; the inputs are not mutated. The chatbot entry always uses the
+        "user" role (with the query escaped if requested), while the message entry keeps the
+        selected ``role`` and the raw query. The returned empty string clears the query box.
+
+        Inputs: infer.chatbot, infer.messages, infer.role, infer.query, infer.escape_html
+        Output: infer.chatbot, infer.messages, infer.query
+        """
+        return (
+            chatbot + [{"role": "user", "content": _escape_html(query) if escape_html else query}],
+            messages + [{"role": role, "content": query}],
+            "",
+        )
+
+    def stream(
+        self,
+        chatbot: list[dict[str, str]],
+        messages: list[dict[str, str]],
+        lang: str,
+        system: str,
+        tools: str,
+        image: Any | None,
+        video: Any | None,
+        audio: Any | None,
+        max_new_tokens: int,
+        top_p: float,
+        temperature: float,
+        skip_special_tokens: bool,
+        escape_html: bool,
+        enable_thinking: bool,
+    ) -> Generator[tuple[list[dict[str, str]], list[dict[str, str]]], None, None]:
+        r"""Generate output text in stream.
+
+        After each new piece of text the last chatbot entry is replaced with the response so
+        far and yielded together with ``messages`` plus the response; ``messages`` itself is
+        not mutated, while ``chatbot`` is updated in place.
+
+        Inputs: infer.chatbot, infer.messages, infer.system, infer.tools, infer.image, infer.video, ...
+        Output: infer.chatbot, infer.messages
+        """
+        # The template's ``enable_thinking`` attribute is overridden while generating.
+        with update_attr(self.engine.template, "enable_thinking", enable_thinking):
+            chatbot.append({"role": "assistant", "content": ""})
+            response = ""
+            for new_text in self.stream_chat(
+                messages,
+                system,
+                tools,
+                images=[image] if image else None,
+                videos=[video] if video else None,
+                audios=[audio] if audio else None,
+                max_new_tokens=max_new_tokens,
+                top_p=top_p,
+                temperature=temperature,
+                skip_special_tokens=skip_special_tokens,
+            ):
+                response += new_text
+                # Tool extraction is attempted on the accumulated response only when tools are given.
+                if tools:
+                    result = self.engine.template.extract_tool(response)
+                else:
+                    result = response
+
+                # A list result is handled as tool calls; anything else as plain assistant text.
+                if isinstance(result, list):
+                    tool_calls = [{"name": tool.name, "arguments": json.loads(tool.arguments)} for tool in result]
+                    tool_calls = json.dumps(tool_calls, ensure_ascii=False)
+                    output_messages = messages + [{"role": Role.FUNCTION.value, "content": tool_calls}]
+                    bot_text = "```json\n" + tool_calls + "\n```"
+                else:
+                    output_messages = messages + [{"role": Role.ASSISTANT.value, "content": result}]
+                    bot_text = _format_response(result, lang, escape_html, self.engine.template.thought_words)
+
+                chatbot[-1] = {"role": "assistant", "content": bot_text}
+                yield chatbot, output_messages
diff --git a/LlamaFactory/src/llamafactory/webui/common.py b/LlamaFactory/src/llamafactory/webui/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..cacf15182b8d9e7898facd26fb4c06d632a2aee7
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/common.py
@@ -0,0 +1,286 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import signal
+from collections import defaultdict
+from datetime import datetime
+from typing import Any
+
+from psutil import Process
+from yaml import safe_dump, safe_load
+
+from ..extras import logging
+from ..extras.constants import (
+    DATA_CONFIG,
+    DEFAULT_TEMPLATE,
+    MULTIMODAL_SUPPORTED_MODELS,
+    SUPPORTED_MODELS,
+    TRAINING_ARGS,
+    DownloadSource,
+)
+from ..extras.misc import use_modelscope, use_openmind
+
+
+logger = logging.get_logger(__name__)
+
+# Directory holding the user config file and the generated DeepSpeed configs.
+DEFAULT_CACHE_DIR = "llamaboard_cache"
+# Not referenced in this part of the module; presumably where saved UI configs live -- confirm.
+DEFAULT_CONFIG_DIR = "llamaboard_config"
+# Not referenced in this part of the module; presumably the default dataset directory -- confirm.
+DEFAULT_DATA_DIR = "data"
+# Root directory that get_save_dir() prepends to checkpoint paths.
+DEFAULT_SAVE_DIR = "saves"
+# File name of the user config inside DEFAULT_CACHE_DIR.
+USER_CONFIG = "user_config.yaml"
+
+
+def abort_process(pid: int) -> None:
+    r"""Send SIGABRT to a process after recursively aborting its children.
+
+    Best effort: any exception raised while listing children or signalling is ignored.
+    """
+    try:
+        # Children are aborted first so the tree is torn down bottom-up.
+        for child in Process(pid).children():
+            abort_process(child.pid)
+
+        os.kill(pid, signal.SIGABRT)
+    except Exception:
+        pass
+
+
+def get_save_dir(*paths: str) -> os.PathLike:
+    r"""Build the checkpoint directory under ``DEFAULT_SAVE_DIR`` from path components.
+
+    If the last component already contains a path separator it is returned unchanged
+    (after a warning). Otherwise spaces are removed from every component before joining.
+    """
+    last_component = paths[-1]
+    if os.path.sep in last_component:
+        logger.warning_rank0("Found complex path, some features may be not available.")
+        return last_component
+
+    cleaned = [component.replace(" ", "").strip() for component in paths]
+    return os.path.join(DEFAULT_SAVE_DIR, *cleaned)
+
+
+def _get_config_path() -> os.PathLike:
+    r"""Return the location of the user config file inside the cache directory."""
+    config_path = os.path.join(DEFAULT_CACHE_DIR, USER_CONFIG)
+    return config_path
+
+
+def load_config() -> dict[str, str | dict[str, Any]]:
+    r"""Load the user config, falling back to defaults for anything missing.
+
+    Returns:
+        A dict that always contains the keys ``lang``, ``hub_name``, ``last_model``,
+        ``path_dict`` and ``cache_dir``. Values found in the config file override the
+        defaults; an unreadable, empty or non-mapping file yields the defaults alone.
+    """
+    config = {"lang": None, "hub_name": None, "last_model": None, "path_dict": {}, "cache_dir": None}
+    try:
+        with open(_get_config_path(), encoding="utf-8") as f:
+            loaded = safe_load(f)
+    except Exception:
+        return config
+
+    # safe_load gives None for an empty file and may give a list or scalar for malformed
+    # content; only a mapping can be merged. Merging also back-fills keys absent from
+    # files written by older versions, so callers can index the result directly.
+    if isinstance(loaded, dict):
+        config.update(loaded)
+
+    # Callers index and assign into path_dict, so it must stay a mapping.
+    if not isinstance(config["path_dict"], dict):
+        config["path_dict"] = {}
+
+    return config
+
+
+def save_config(
+    lang: str, hub_name: str | None = None, model_name: str | None = None, model_path: str | None = None
+) -> None:
+    r"""Update the user config on disk with the given values.
+
+    Falsy arguments leave the stored value untouched. ``model_path`` is recorded in
+    ``path_dict`` only when ``model_name`` is given as well.
+    """
+    os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True)
+    user_config = load_config()
+
+    if not lang:
+        lang = user_config["lang"]
+
+    user_config["lang"] = lang
+
+    if hub_name:
+        user_config["hub_name"] = hub_name
+
+    if model_name:
+        user_config["last_model"] = model_name
+        if model_path:
+            user_config["path_dict"][model_name] = model_path
+
+    with open(_get_config_path(), "w", encoding="utf-8") as config_file:
+        safe_dump(user_config, config_file)
+
+
+def get_model_path(model_name: str) -> str:
+    r"""Resolve the path of a model from the user config or the supported-model table.
+
+    A path saved in the user config wins; otherwise the default download source is used.
+    When an alternative hub is enabled and the resolved path is still the default one,
+    it is swapped for that hub's path if the model has one.
+    """
+    user_config = load_config()
+    path_dict: dict[DownloadSource, str] = SUPPORTED_MODELS.get(model_name, defaultdict(str))
+    model_path = user_config["path_dict"].get(model_name, "") or path_dict.get(DownloadSource.DEFAULT, "")
+
+    # Checked in order: modelscope first, then openmind. Each swap only applies while the
+    # path still equals the default (hf) path.
+    alternative_hubs = (
+        (use_modelscope, DownloadSource.MODELSCOPE),
+        (use_openmind, DownloadSource.OPENMIND),
+    )
+    for hub_enabled, source in alternative_hubs:
+        if hub_enabled() and path_dict.get(source) and model_path == path_dict.get(DownloadSource.DEFAULT):
+            model_path = path_dict.get(source)
+
+    return model_path
+
+
+def get_template(model_name: str) -> str:
+    r"""Look up the template registered for a model, or "default" when there is none."""
+    template_name = DEFAULT_TEMPLATE.get(model_name, "default")
+    return template_name
+
+
+def get_time() -> str:
+    r"""Format the current local date and time as ``YYYY-MM-DD-HH-MM-SS``."""
+    now = datetime.now()
+    return now.strftime(r"%Y-%m-%d-%H-%M-%S")
+
+
+def is_multimodal(model_name: str) -> bool:
+    r"""Tell whether the model name is listed among the multimodal supported models."""
+    listed = model_name in MULTIMODAL_SUPPORTED_MODELS
+    return listed
+
+
+def load_dataset_info(dataset_dir: str) -> dict[str, dict[str, Any]]:
+    r"""Read the dataset description file found in ``dataset_dir``.
+
+    Returns an empty dict for online datasets (``"ONLINE"`` or a ``"REMOTE:"`` prefix)
+    and when the file cannot be opened or parsed; the latter case is logged as a warning.
+    """
+    uses_online_dataset = dataset_dir == "ONLINE" or dataset_dir.startswith("REMOTE:")
+    if uses_online_dataset:
+        logger.info_rank0(f"dataset_dir is {dataset_dir}, using online dataset.")
+        return {}
+
+    try:
+        with open(os.path.join(dataset_dir, DATA_CONFIG), encoding="utf-8") as info_file:
+            dataset_info = json.load(info_file)
+    except Exception as err:
+        logger.warning_rank0(f"Cannot open {os.path.join(dataset_dir, DATA_CONFIG)} due to {str(err)}.")
+        return {}
+
+    return dataset_info
+
+
+def load_args(config_path: str) -> dict[str, Any] | None:
+    r"""Read a training configuration from a YAML file.
+
+    Returns ``None`` when the file cannot be opened or parsed.
+    """
+    try:
+        with open(config_path, encoding="utf-8") as config_file:
+            loaded_args = safe_load(config_file)
+    except Exception:
+        return None
+
+    return loaded_args
+
+
+def save_args(config_path: str, config_dict: dict[str, Any]) -> None:
+    r"""Write a training configuration to a YAML file, overwriting any existing file."""
+    with open(config_path, "w", encoding="utf-8") as config_file:
+        safe_dump(config_dict, config_file)
+
+
+def _clean_cmd(args: dict[str, Any]) -> dict[str, Any]:
+    r"""Drop arguments whose value is ``None``, ``False`` or an empty string.
+
+    A small set of keys is always kept regardless of value, so that an explicit
+    ``False`` for them is preserved. Other falsy values such as ``0`` are kept.
+    """
+    always_kept = {
+        "packing",
+        "enable_thinking",
+        "use_reentrant_gc",
+        "double_quantization",
+        "freeze_vision_tower",
+        "freeze_multi_modal_projector",
+    }
+    cleaned = {}
+    for key, value in args.items():
+        if key in always_kept:
+            cleaned[key] = value
+        elif value is not None and value is not False and value != "":
+            cleaned[key] = value
+
+    return cleaned
+
+
+def gen_cmd(args: dict[str, Any]) -> str:
+    r"""Render the arguments as a multi-line ``llamafactory-cli train`` command for preview.
+
+    Dict values are shown as JSON, list values as space-separated items and everything
+    else via ``str``. Lines are joined with a backtick continuation on Windows and a
+    backslash elsewhere, and the result is wrapped in a ``bash`` code fence.
+    """
+    cmd_lines = ["llamafactory-cli train "]
+    for key, value in _clean_cmd(args).items():
+        if isinstance(value, dict):
+            rendered = json.dumps(value, ensure_ascii=False)
+        elif isinstance(value, list):
+            rendered = " ".join(map(str, value))
+        else:
+            rendered = str(value)
+
+        cmd_lines.append(f"    --{key} {rendered} ")
+
+    line_continuation = "`\n" if os.name == "nt" else "\\\n"
+    cmd_text = line_continuation.join(cmd_lines)
+    return f"```bash\n{cmd_text}\n```"
+
+
+def save_cmd(args: dict[str, Any]) -> str:
+    r"""Write the cleaned arguments as YAML into ``args["output_dir"]`` and return the file path.
+
+    The output directory is created if it does not exist.
+    """
+    output_dir = args["output_dir"]
+    os.makedirs(output_dir, exist_ok=True)
+
+    args_path = os.path.join(output_dir, TRAINING_ARGS)
+    with open(args_path, "w", encoding="utf-8") as args_file:
+        safe_dump(_clean_cmd(args), args_file)
+
+    return args_path
+
+
+def load_eval_results(path: os.PathLike) -> str:
+    r"""Read a JSON results file and return it pretty-printed inside a ``json`` code fence."""
+    with open(path, encoding="utf-8") as results_file:
+        scores = json.load(results_file)
+
+    result = json.dumps(scores, indent=4)
+    return f"```json\n{result}\n```\n"
+
+
+def calculate_pixels(pixels: str) -> int:
+    r"""Calculate the number of pixels from an expression such as ``"768*768"`` or ``"589824"``.
+
+    The expression is a single integer or several integers separated by ``*``; the result
+    is the product of all of them. Whitespace around each number is accepted.
+
+    Raises:
+        ValueError: If any factor is not a valid integer (e.g. an empty factor in ``"768*"``).
+    """
+    total = 1
+    # Multiply every factor: taking only the first two would silently ignore the rest
+    # of an expression like "2*3*4".
+    for factor in pixels.split("*"):
+        total *= int(factor)
+
+    return total
+
+
+def create_ds_config() -> None:
+    r"""Write four DeepSpeed config files into ``DEFAULT_CACHE_DIR``.
+
+    The files are ``ds_z2_config.json``, ``ds_z2_offload_config.json``, ``ds_z3_config.json``
+    and ``ds_z3_offload_config.json``. The directory is created if needed and existing files
+    are overwritten. One ``ds_config`` dict is reused and modified between dumps, so the
+    order of the statements below matters.
+    """
+    os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True)
+    # Settings shared by all four files.
+    ds_config = {
+        "train_batch_size": "auto",
+        "train_micro_batch_size_per_gpu": "auto",
+        "gradient_accumulation_steps": "auto",
+        "gradient_clipping": "auto",
+        "zero_allow_untested_optimizer": True,
+        "fp16": {
+            "enabled": "auto",
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "initial_scale_power": 16,
+            "hysteresis": 2,
+            "min_loss_scale": 1,
+        },
+        "bf16": {"enabled": "auto"},
+    }
+    # Offload target used by the two "offload" variants.
+    offload_config = {
+        "device": "cpu",
+        "pin_memory": True,
+    }
+    # Stage 2 without offload.
+    ds_config["zero_optimization"] = {
+        "stage": 2,
+        "allgather_partitions": True,
+        "allgather_bucket_size": 5e8,
+        "overlap_comm": False,
+        "reduce_scatter": True,
+        "reduce_bucket_size": 5e8,
+        "contiguous_gradients": True,
+        "round_robin_gradients": True,
+    }
+    with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z2_config.json"), "w", encoding="utf-8") as f:
+        json.dump(ds_config, f, indent=2)
+
+    # Stage 2 with optimizer offload: same settings plus "offload_optimizer".
+    ds_config["zero_optimization"]["offload_optimizer"] = offload_config
+    with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z2_offload_config.json"), "w", encoding="utf-8") as f:
+        json.dump(ds_config, f, indent=2)
+
+    # Stage 3 without offload: the whole "zero_optimization" section is replaced,
+    # which also discards the stage-2 offload entry added above.
+    ds_config["zero_optimization"] = {
+        "stage": 3,
+        "overlap_comm": False,
+        "contiguous_gradients": True,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": True,
+    }
+    with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z3_config.json"), "w", encoding="utf-8") as f:
+        json.dump(ds_config, f, indent=2)
+
+    # Stage 3 with both optimizer and parameter offload.
+    ds_config["zero_optimization"]["offload_optimizer"] = offload_config
+    ds_config["zero_optimization"]["offload_param"] = offload_config
+    with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z3_offload_config.json"), "w", encoding="utf-8") as f:
+        json.dump(ds_config, f, indent=2)
diff --git a/LlamaFactory/src/llamafactory/webui/components/__init__.py b/LlamaFactory/src/llamafactory/webui/components/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2c64ea739b881b0908e476343ea55e59f56e373
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .chatbot import create_chat_box
+from .eval import create_eval_tab
+from .export import create_export_tab
+from .footer import create_footer
+from .infer import create_infer_tab
+from .top import create_top
+from .train import create_train_tab
+
+
+# Names exported by ``from ... import *``: the component factory functions imported above.
+__all__ = [
+    "create_chat_box",
+    "create_eval_tab",
+    "create_export_tab",
+    "create_footer",
+    "create_infer_tab",
+    "create_top",
+    "create_train_tab",
+]
diff --git a/LlamaFactory/src/llamafactory/webui/components/chatbot.py b/LlamaFactory/src/llamafactory/webui/components/chatbot.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5ff5492cafbf183f113f399206c835c11f99623
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/chatbot.py
@@ -0,0 +1,143 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import json
+from typing import TYPE_CHECKING
+
+from ...data import Role
+from ...extras.packages import is_gradio_available
+from ..locales import ALERTS
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+    from ..engine import Engine
+
+
+def check_json_schema(text: str, lang: str) -> None:
+    r"""Validate the tools text and show a warning in the UI when it is malformed.
+
+    The text must be JSON. An empty or falsy value (``[]``, ``{}``, ``null``, ...) is
+    accepted silently. Otherwise it must be a list of objects, each with a ``"name"`` key.
+
+    Args:
+        text: Raw content of the tools text box.
+        lang: Language key used to pick the warning message.
+
+    Nothing is returned or raised; problems are reported through ``gr.Warning``:
+    ``err_tool_name`` for a tool without a name, ``err_json_schema`` for anything else.
+    """
+    try:
+        tools = json.loads(text)
+    except Exception:
+        gr.Warning(ALERTS["err_json_schema"][lang])
+        return
+
+    if not tools:
+        return
+
+    # Explicit checks instead of ``assert``: assertions are removed when Python runs
+    # with -O, which would let a non-list value through.
+    if not isinstance(tools, list):
+        gr.Warning(ALERTS["err_json_schema"][lang])
+        return
+
+    for tool in tools:
+        # Each tool must be an object; a bare string would otherwise be accepted by a
+        # substring test whenever it happens to contain "name".
+        if not isinstance(tool, dict):
+            gr.Warning(ALERTS["err_json_schema"][lang])
+            return
+
+        if "name" not in tool:
+            gr.Warning(ALERTS["err_tool_name"][lang])
+            return
+
+
+def create_chat_box(
+    engine: "Engine", visible: bool = False
+) -> tuple["Component", "Component", dict[str, "Component"]]:
+    r"""Build the chat panel and wire its events.
+
+    Args:
+        engine: Provides ``manager`` (component lookup) and ``chatter`` (append/stream handlers).
+        visible: Initial visibility of the enclosing column.
+
+    Returns:
+        The chatbot component, the messages state, and a dict of the remaining components
+        keyed by name.
+    """
+    # Looked up by element id and later passed as an event input, so this is presumably
+    # the language component rather than a language string.
+    lang = engine.manager.get_elem_by_id("top.lang")
+    with gr.Column(visible=visible) as chat_box:
+        # Optional Chatbot arguments are passed only if the installed gradio accepts them.
+        kwargs = {}
+        if "show_copy_button" in inspect.signature(gr.Chatbot.__init__).parameters:
+            kwargs["show_copy_button"] = True
+
+        if "resizable" in inspect.signature(gr.Chatbot.__init__).parameters:
+            kwargs["resizable"] = True
+
+        chatbot = gr.Chatbot(type="messages", **kwargs)
+        messages = gr.State([])
+        with gr.Row():
+            # Left side: role/system/tools inputs, media inputs, query box and submit button.
+            with gr.Column(scale=4):
+                with gr.Row():
+                    with gr.Column():
+                        role = gr.Dropdown(choices=[Role.USER.value, Role.OBSERVATION.value], value=Role.USER.value)
+                        system = gr.Textbox(show_label=False)
+                        tools = gr.Textbox(show_label=False, lines=3)
+
+                    with gr.Column() as mm_box:
+                        with gr.Tab("Image"):
+                            image = gr.Image(type="pil")
+
+                        with gr.Tab("Video"):
+                            video = gr.Video()
+
+                        with gr.Tab("Audio"):
+                            audio = gr.Audio(type="filepath")
+
+                query = gr.Textbox(show_label=False, lines=8)
+                submit_btn = gr.Button(variant="primary")
+
+            # Right side: generation settings and the clear button.
+            with gr.Column(scale=1):
+                max_new_tokens = gr.Slider(minimum=8, maximum=8192, value=1024, step=1)
+                top_p = gr.Slider(minimum=0.01, maximum=1.0, value=0.7, step=0.01)
+                temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01)
+                skip_special_tokens = gr.Checkbox(value=True)
+                escape_html = gr.Checkbox(value=True)
+                enable_thinking = gr.Checkbox(value=True)
+                clear_btn = gr.Button()
+
+    # Validate the tools text on every edit.
+    tools.input(check_json_schema, inputs=[tools, engine.manager.get_elem_by_id("top.lang")])
+
+    # On submit: first append the query to the chat, then run the streaming generation.
+    submit_btn.click(
+        engine.chatter.append,
+        [chatbot, messages, role, query, escape_html],
+        [chatbot, messages, query],
+    ).then(
+        engine.chatter.stream,
+        [
+            chatbot,
+            messages,
+            lang,
+            system,
+            tools,
+            image,
+            video,
+            audio,
+            max_new_tokens,
+            top_p,
+            temperature,
+            skip_special_tokens,
+            escape_html,
+            enable_thinking,
+        ],
+        [chatbot, messages],
+    )
+    # Clearing resets both the displayed chat and the stored messages.
+    clear_btn.click(lambda: ([], []), outputs=[chatbot, messages])
+
+    return (
+        chatbot,
+        messages,
+        dict(
+            chat_box=chat_box,
+            role=role,
+            system=system,
+            tools=tools,
+            mm_box=mm_box,
+            image=image,
+            video=video,
+            audio=audio,
+            query=query,
+            submit_btn=submit_btn,
+            max_new_tokens=max_new_tokens,
+            top_p=top_p,
+            temperature=temperature,
+            skip_special_tokens=skip_special_tokens,
+            escape_html=escape_html,
+            enable_thinking=enable_thinking,
+            clear_btn=clear_btn,
+        ),
+    )
diff --git a/LlamaFactory/src/llamafactory/webui/components/data.py b/LlamaFactory/src/llamafactory/webui/components/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f27bd19674735662f88ff53b267b772fcc2f44e
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/data.py
@@ -0,0 +1,122 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from typing import TYPE_CHECKING, Any
+
+from ...extras.constants import DATA_CONFIG
+from ...extras.packages import is_gradio_available
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+
+PAGE_SIZE = 2  # number of samples per preview page (width of the slice taken in get_preview)
+
+
+def prev_page(page_index: int) -> int:  # one page back; indices that are not above 0 are returned unchanged
+    return page_index if page_index <= 0 else page_index - 1
+
+
+def next_page(page_index: int, total_num: int) -> int:  # one page forward unless the next page would be empty
+    return page_index if (page_index + 1) * PAGE_SIZE >= total_num else page_index + 1
+
+
+def can_preview(dataset_dir: str, dataset: list) -> "gr.Button":
+    r"""Return a preview button that is interactive only when the first selected dataset has local data.
+
+    The button is disabled when the config under ``dataset_dir`` cannot be read, nothing is selected, the
+    selected name is missing from the config (custom dropdown values) or its ``file_name`` has no data on disk.
+    """
+    try:
+        with open(os.path.join(dataset_dir, DATA_CONFIG), encoding="utf-8") as f:
+            dataset_info = json.load(f)
+        # Unknown names and malformed entries fall through to the disabled state instead of raising.
+        data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
+    except Exception:
+        return gr.Button(interactive=False)
+
+    has_data = os.path.isfile(data_path) or (os.path.isdir(data_path) and bool(os.listdir(data_path)))
+    return gr.Button(interactive=has_data)
+
+
+def _load_data_file(file_path: str) -> list[Any]:
+    r"""Load samples from ``.json`` (whole document), ``.jsonl`` (one per non-blank line) or raw text lines."""
+    with open(file_path, encoding="utf-8") as f:
+        if file_path.endswith(".json"):
+            return json.load(f)
+        if file_path.endswith(".jsonl"):  # blank lines are skipped rather than fed to json.loads
+            return [json.loads(line) for line in f if line.strip()]
+        return list(f)
+
+
+def get_preview(dataset_dir: str, dataset: list, page_index: int) -> tuple[int, list, "gr.Column"]:
+    r"""Return the sample count, the samples of page ``page_index`` and a visible preview column.
+
+    Only the first selected dataset is previewed; a directory dataset is read in sorted file-name order.
+    """
+    with open(os.path.join(dataset_dir, DATA_CONFIG), encoding="utf-8") as f:
+        dataset_info = json.load(f)
+    data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
+    if os.path.isfile(data_path):
+        data = _load_data_file(data_path)
+    else:  # os.listdir order is arbitrary; sort so page contents stay stable between requests
+        data = [s for name in sorted(os.listdir(data_path)) for s in _load_data_file(os.path.join(data_path, name))]
+    start = PAGE_SIZE * page_index
+    return len(data), data[start : start + PAGE_SIZE], gr.Column(visible=True)
+
+
+def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> dict[str, "Component"]:
+    r"""Create the preview button and its hidden modal box, wire paging/closing events and return the components."""
+    data_preview_btn = gr.Button(interactive=False, scale=1)
+    with gr.Column(visible=False, elem_classes="modal-box") as preview_box:
+        with gr.Row():
+            preview_count = gr.Number(value=0, interactive=False, precision=0)
+            page_index = gr.Number(value=0, interactive=False, precision=0)
+
+        with gr.Row():
+            prev_btn = gr.Button()
+            next_btn = gr.Button()
+            close_btn = gr.Button()
+
+        with gr.Row():
+            preview_samples = gr.JSON()
+
+    # Every call to get_preview below reads and writes the same components.
+    ports = dict(inputs=[dataset_dir, dataset, page_index], outputs=[preview_count, preview_samples, preview_box])
+    # Changing the selection re-checks previewability, then resets the page index to 0.
+    dataset.change(can_preview, [dataset_dir, dataset], [data_preview_btn], queue=False).then(
+        lambda: 0, outputs=[page_index], queue=False
+    )
+    data_preview_btn.click(get_preview, queue=False, **ports)
+    prev_btn.click(prev_page, [page_index], [page_index], queue=False).then(get_preview, queue=False, **ports)
+    next_btn.click(next_page, [page_index, preview_count], [page_index], queue=False).then(
+        get_preview, queue=False, **ports
+    )
+    close_btn.click(lambda: gr.Column(visible=False), outputs=[preview_box], queue=False)
+    return {
+        "data_preview_btn": data_preview_btn,
+        "preview_count": preview_count,
+        "page_index": page_index,
+        "prev_btn": prev_btn,
+        "next_btn": next_btn,
+        "close_btn": close_btn,
+        "preview_samples": preview_samples,
+    }
diff --git a/LlamaFactory/src/llamafactory/webui/components/eval.py b/LlamaFactory/src/llamafactory/webui/components/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..3804a77d6b1bd24e91e6d9fc16c0cea4846707fb
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/eval.py
@@ -0,0 +1,94 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...extras.packages import is_gradio_available
+from ..common import DEFAULT_DATA_DIR
+from ..control import list_datasets
+from .data import create_preview_box
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+    from ..engine import Engine
+
+
+def create_eval_tab(engine: "Engine") -> dict[str, "Component"]:
+    r"""Build the evaluation tab, wire its buttons to ``engine.runner`` and return its components by name."""
+    input_elems = engine.manager.get_base_elems()
+    elem_dict = {}
+
+    with gr.Row():
+        dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2)
+        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
+        preview_elems = create_preview_box(dataset_dir, dataset)
+
+    input_elems.update({dataset_dir, dataset})
+    elem_dict.update(dataset_dir=dataset_dir, dataset=dataset, **preview_elems)
+
+    with gr.Row():
+        cutoff_len = gr.Slider(minimum=4, maximum=131072, value=1024, step=1)
+        max_samples = gr.Textbox(value="100000")
+        batch_size = gr.Slider(minimum=1, maximum=1024, value=2, step=1)
+        predict = gr.Checkbox(value=True)
+
+    input_elems.update({cutoff_len, max_samples, batch_size, predict})
+    elem_dict.update(cutoff_len=cutoff_len, max_samples=max_samples, batch_size=batch_size, predict=predict)
+
+    with gr.Row():
+        max_new_tokens = gr.Slider(minimum=8, maximum=4096, value=512, step=1)
+        top_p = gr.Slider(minimum=0.01, maximum=1, value=0.7, step=0.01)
+        temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01)
+        output_dir = gr.Textbox()
+
+    input_elems.update({max_new_tokens, top_p, temperature, output_dir})
+    elem_dict.update(max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, output_dir=output_dir)
+
+    with gr.Row():
+        cmd_preview_btn = gr.Button()
+        start_btn = gr.Button(variant="primary")
+        stop_btn = gr.Button(variant="stop")
+
+    with gr.Row():
+        resume_btn = gr.Checkbox(visible=False, interactive=False)
+        progress_bar = gr.Slider(visible=False, interactive=False)
+
+    with gr.Row():
+        output_box = gr.Markdown()
+
+    elem_dict.update(
+        cmd_preview_btn=cmd_preview_btn,
+        start_btn=start_btn,
+        stop_btn=stop_btn,
+        resume_btn=resume_btn,
+        progress_bar=progress_bar,
+        output_box=output_box,
+    )
+    output_elems = [output_box, progress_bar]
+    runner = engine.runner
+
+    cmd_preview_btn.click(runner.preview_eval, inputs=input_elems, outputs=output_elems, concurrency_limit=None)
+    start_btn.click(runner.run_eval, inputs=input_elems, outputs=output_elems)
+    stop_btn.click(runner.set_abort)
+    resume_btn.change(runner.monitor, outputs=output_elems, concurrency_limit=None)
+
+    dataset.focus(list_datasets, inputs=[dataset_dir], outputs=[dataset], queue=False)
+
+    return elem_dict
diff --git a/LlamaFactory/src/llamafactory/webui/components/export.py b/LlamaFactory/src/llamafactory/webui/components/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..9597aa61b8c5bab286200d1d235e50b655518faa
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/export.py
@@ -0,0 +1,169 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+from ...extras.constants import PEFT_METHODS
+from ...extras.misc import torch_gc
+from ...extras.packages import is_gradio_available
+from ...train.tuner import export_model
+from ..common import get_save_dir, load_config
+from ..locales import ALERTS
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+    from ..engine import Engine
+
+
+GPTQ_BITS = ["8", "4", "3", "2"]  # export quantization choices; strings for the dropdown, cast to int in save_model
+
+
+def can_quantize(checkpoint_path: str | list[str]) -> "gr.Dropdown":
+    r"""Lock the quantization dropdown to "none" while a non-empty list of checkpoints is selected."""
+    if not isinstance(checkpoint_path, list) or len(checkpoint_path) == 0:
+        return gr.Dropdown(interactive=True)
+    return gr.Dropdown(value="none", interactive=False)
+
+
+def save_model(
+    lang: str,
+    model_name: str,
+    model_path: str,
+    finetuning_type: str,
+    checkpoint_path: str | list[str],
+    template: str,
+    export_size: int,
+    export_quantization_bit: str,
+    export_quantization_dataset: str,
+    export_device: str,
+    export_legacy_format: bool,
+    export_dir: str,
+    export_hub_model_id: str,
+    extra_args: str,
+) -> Generator[str, None, None]:
+    r"""Validate the export form and export the model, yielding localized status messages.
+
+    ``extra_args`` must decode to a JSON object; it is merged over the form values before checkpoints are applied.
+    """
+    user_config = load_config()
+    error = ""
+    if not model_name:
+        error = ALERTS["err_no_model"][lang]
+    elif not model_path:
+        error = ALERTS["err_no_path"][lang]
+    elif not export_dir:
+        error = ALERTS["err_no_export_dir"][lang]
+    elif export_quantization_bit in GPTQ_BITS and not export_quantization_dataset:
+        error = ALERTS["err_no_dataset"][lang]
+    elif export_quantization_bit not in GPTQ_BITS and not checkpoint_path:
+        error = ALERTS["err_no_adapter"][lang]
+    elif export_quantization_bit in GPTQ_BITS and checkpoint_path and isinstance(checkpoint_path, list):
+        error = ALERTS["err_gptq_lora"][lang]
+    try:  # decode once; the result is reused for the merge into args
+        extra_kwargs = json.loads(extra_args)
+    except json.JSONDecodeError:
+        extra_kwargs = None
+    if not isinstance(extra_kwargs, dict):  # valid JSON that is no object (e.g. "null", "[1]") would crash args.update
+        error = ALERTS["err_json_schema"][lang]
+    if error:
+        gr.Warning(error)
+        yield error
+        return
+
+    args = dict(
+        model_name_or_path=model_path,
+        cache_dir=user_config.get("cache_dir", None),
+        finetuning_type=finetuning_type,
+        template=template,
+        export_dir=export_dir,
+        export_hub_model_id=export_hub_model_id or None,
+        export_size=export_size,
+        export_quantization_bit=int(export_quantization_bit) if export_quantization_bit in GPTQ_BITS else None,
+        export_quantization_dataset=export_quantization_dataset,
+        export_device=export_device,
+        export_legacy_format=export_legacy_format,
+        trust_remote_code=True,
+    )
+    args.update(extra_kwargs)
+    if checkpoint_path and finetuning_type in PEFT_METHODS:  # list of adapter checkpoints
+        adapters = [get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path]
+        args["adapter_name_or_path"] = ",".join(adapters)
+    elif checkpoint_path:  # a single checkpoint name (str)
+        args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path)
+    yield ALERTS["info_exporting"][lang]
+    export_model(args)
+    torch_gc()
+    yield ALERTS["info_exported"][lang]
+
+
+def create_export_tab(engine: "Engine") -> dict[str, "Component"]:
+    r"""Build the export tab and return its components by name.
+
+    The export button sends the messages yielded by ``save_model`` to the non-interactive info box.
+    """
+    with gr.Row():
+        export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1)
+        export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none")
+        export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl")
+        export_device = gr.Radio(choices=["cpu", "auto"], value="cpu")
+        export_legacy_format = gr.Checkbox()
+
+    with gr.Row():
+        export_dir = gr.Textbox()
+        export_hub_model_id = gr.Textbox()
+        extra_args = gr.Textbox(value="{}")
+
+    checkpoint_path: gr.Dropdown = engine.manager.get_elem_by_id("top.checkpoint_path")
+    checkpoint_path.change(can_quantize, [checkpoint_path], [export_quantization_bit], queue=False)
+
+    export_btn = gr.Button()
+    info_box = gr.Textbox(show_label=False, interactive=False)
+
+    # Inputs follow the positional parameter order of save_model: shared top-panel fields, then export fields.
+    top_elems = [
+        engine.manager.get_elem_by_id(f"top.{name}")
+        for name in ("lang", "model_name", "model_path", "finetuning_type", "checkpoint_path", "template")
+    ]
+    export_elems = [
+        export_size,
+        export_quantization_bit,
+        export_quantization_dataset,
+        export_device,
+        export_legacy_format,
+        export_dir,
+        export_hub_model_id,
+        extra_args,
+    ]
+    export_btn.click(save_model, inputs=top_elems + export_elems, outputs=[info_box])
+
+    return {
+        "export_size": export_size,
+        "export_quantization_bit": export_quantization_bit,
+        "export_quantization_dataset": export_quantization_dataset,
+        "export_device": export_device,
+        "export_legacy_format": export_legacy_format,
+        "export_dir": export_dir,
+        "export_hub_model_id": export_hub_model_id,
+        "extra_args": extra_args,
+        "export_btn": export_btn,
+        "info_box": info_box,
+    }
diff --git a/LlamaFactory/src/llamafactory/webui/components/footer.py b/LlamaFactory/src/llamafactory/webui/components/footer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ee9bbce4b9f3ac24e48f911d27b78577f1f821c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/footer.py
@@ -0,0 +1,45 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...extras.misc import get_current_memory
+from ...extras.packages import is_gradio_available
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+
+def get_device_memory() -> "gr.Slider":
+    r"""Return a visible slider of used vs. total memory (scaled by 1024**3), or a hidden one if total is -1."""
+    free, total = get_current_memory()
+    if total == -1:
+        return gr.Slider(visible=False)
+    scale = 1024**3
+    used_scaled, total_scaled = round((total - free) / scale, 2), round(total / scale, 2)
+    return gr.Slider(minimum=0, maximum=total_scaled, value=used_scaled, step=0.01, visible=True)
+
+
+def create_footer() -> dict[str, "Component"]:
+    r"""Create the hidden memory slider and a ``gr.Timer(value=5)`` whose ticks refresh it."""
+    with gr.Row():
+        device_memory = gr.Slider(visible=False, interactive=False)
+        timer = gr.Timer(value=5)
+    timer.tick(fn=get_device_memory, outputs=[device_memory], queue=False)
+    return {"device_memory": device_memory}
diff --git a/LlamaFactory/src/llamafactory/webui/components/infer.py b/LlamaFactory/src/llamafactory/webui/components/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef508cdf6eeff3d8f541100257c602b596161c6c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/infer.py
@@ -0,0 +1,76 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...extras.packages import is_gradio_available
+from ..common import is_multimodal
+from .chatbot import create_chat_box
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+    from ..engine import Engine
+
+
+def create_infer_tab(engine: "Engine") -> dict[str, "Component"]:
+    r"""Build the inference tab (backend options, load/unload buttons, chat box) and return its components."""
+    input_elems = engine.manager.get_base_elems()
+    elem_dict = {}
+
+    with gr.Row():
+        infer_backend = gr.Dropdown(choices=["huggingface", "vllm", "sglang"], value="huggingface")
+        infer_dtype = gr.Dropdown(choices=["auto", "float16", "bfloat16", "float32"], value="auto")
+        extra_args = gr.Textbox(value='{"vllm_enforce_eager": true}')
+
+    with gr.Row():
+        load_btn = gr.Button()
+        unload_btn = gr.Button()
+
+    info_box = gr.Textbox(show_label=False, interactive=False)
+
+    input_elems.update({infer_backend, infer_dtype, extra_args})
+    elem_dict.update(
+        infer_backend=infer_backend,
+        infer_dtype=infer_dtype,
+        extra_args=extra_args,
+        load_btn=load_btn,
+        unload_btn=unload_btn,
+        info_box=info_box,
+    )
+
+    chatbot, messages, chat_elems = create_chat_box(engine, visible=False)
+    elem_dict.update(chat_elems)
+    chat_box = chat_elems["chat_box"]
+
+    def toggle_chat_box():  # reads the flag each time the chained event runs, not when the tab is built
+        return gr.Column(visible=engine.chatter.loaded)
+
+    load_btn.click(engine.chatter.load_model, input_elems, [info_box]).then(toggle_chat_box, outputs=[chat_box])
+    unload_btn.click(engine.chatter.unload_model, input_elems, [info_box]).then(
+        lambda: ([], []), outputs=[chatbot, messages]
+    ).then(toggle_chat_box, outputs=[chat_box])
+
+    engine.manager.get_elem_by_id("top.model_name").change(
+        fn=lambda name: gr.Column(visible=is_multimodal(name)),
+        inputs=[engine.manager.get_elem_by_id("top.model_name")],
+        outputs=[chat_elems["mm_box"]],
+    )
+
+    return elem_dict
diff --git a/LlamaFactory/src/llamafactory/webui/components/top.py b/LlamaFactory/src/llamafactory/webui/components/top.py
new file mode 100644
index 0000000000000000000000000000000000000000..12275f1610acd89e2df6a765ca449c023aae38f3
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/top.py
@@ -0,0 +1,82 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...data import TEMPLATES
+from ...extras.constants import METHODS, SUPPORTED_MODELS
+from ...extras.misc import use_modelscope, use_openmind
+from ...extras.packages import is_gradio_available
+from ..common import save_config
+from ..control import can_quantize, can_quantize_to, check_template, get_model_info, list_checkpoints, switch_hub
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+
+def create_top() -> dict[str, "Component"]:
+    r"""Create the top panel (language, model, hub, checkpoints, quantization, template, ...) and wire its events."""
+    available_models = [*SUPPORTED_MODELS.keys(), "Custom"]
+    default_hub = "modelscope" if use_modelscope() else ("openmind" if use_openmind() else "huggingface")
+
+    with gr.Row():
+        lang = gr.Dropdown(choices=["en", "ru", "zh", "ko", "ja"], value=None, scale=1)
+        model_name = gr.Dropdown(choices=available_models, value=None, scale=2)
+        model_path = gr.Textbox(scale=2)
+        hub_name = gr.Dropdown(choices=["huggingface", "modelscope", "openmind"], value=default_hub, scale=2)
+
+    with gr.Row():
+        finetuning_type = gr.Dropdown(choices=METHODS, value="lora", scale=1)
+        checkpoint_path = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=6)
+
+    with gr.Row():
+        quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", allow_custom_value=True)
+        quantization_method = gr.Dropdown(choices=["bnb", "hqq", "eetq"], value="bnb")
+        template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default")
+        rope_scaling = gr.Dropdown(choices=["none", "linear", "dynamic", "yarn", "llama3"], value="none")
+        booster = gr.Dropdown(choices=["auto", "flashattn2", "unsloth", "liger_kernel"], value="auto")
+
+    # Argument bundles for the two refresh steps that several event chains below share.
+    model_info = dict(fn=get_model_info, inputs=[model_name], outputs=[model_path, template], queue=False)
+    ckpts = dict(fn=list_checkpoints, inputs=[model_name, finetuning_type], outputs=[checkpoint_path], queue=False)
+
+    model_name.change(**model_info).then(**ckpts).then(check_template, [lang, template])
+    model_name.input(save_config, inputs=[lang, hub_name, model_name], queue=False)
+    model_path.input(save_config, inputs=[lang, hub_name, model_name, model_path], queue=False)
+    finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then(**ckpts)
+    checkpoint_path.focus(**ckpts)
+    quantization_method.change(can_quantize_to, [quantization_method], [quantization_bit], queue=False)
+    hub_name.change(switch_hub, inputs=[hub_name], queue=False).then(**model_info).then(**ckpts).then(
+        check_template, [lang, template]
+    )
+    hub_name.input(save_config, inputs=[lang, hub_name], queue=False)
+
+    return {
+        "lang": lang,
+        "model_name": model_name,
+        "model_path": model_path,
+        "hub_name": hub_name,
+        "finetuning_type": finetuning_type,
+        "checkpoint_path": checkpoint_path,
+        "quantization_bit": quantization_bit,
+        "quantization_method": quantization_method,
+        "template": template,
+        "rope_scaling": rope_scaling,
+        "booster": booster,
+    }
diff --git a/LlamaFactory/src/llamafactory/webui/components/train.py b/LlamaFactory/src/llamafactory/webui/components/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b7aa6e946f6c9fc5aaea4cab053a44430032d61
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/components/train.py
@@ -0,0 +1,447 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from transformers.trainer_utils import SchedulerType
+
+from ...extras.constants import TRAINING_STAGES
+from ...extras.misc import get_device_count
+from ...extras.packages import is_gradio_available
+from ..common import DEFAULT_DATA_DIR
+from ..control import change_stage, list_checkpoints, list_config_paths, list_datasets, list_output_dirs
+from .data import create_preview_box
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+    from ..engine import Engine
+
+
+def create_train_tab(engine: "Engine") -> dict[str, "Component"]:
+    input_elems = engine.manager.get_base_elems()
+    elem_dict = dict()
+
+    with gr.Row():
+        stages = list(TRAINING_STAGES.keys())
+        training_stage = gr.Dropdown(choices=stages, value=stages[0], scale=1)
+        dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1)
+        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
+        preview_elems = create_preview_box(dataset_dir, dataset)
+
+    input_elems.update({training_stage, dataset_dir, dataset})
+    elem_dict.update(dict(training_stage=training_stage, dataset_dir=dataset_dir, dataset=dataset, **preview_elems))
+
+    with gr.Row():
+        learning_rate = gr.Textbox(value="5e-5")
+        num_train_epochs = gr.Textbox(value="3.0")
+        max_grad_norm = gr.Textbox(value="1.0")
+        max_samples = gr.Textbox(value="100000")
+        compute_type = gr.Dropdown(choices=["bf16", "fp16", "fp32", "pure_bf16"], value="bf16")
+
+    input_elems.update({learning_rate, num_train_epochs, max_grad_norm, max_samples, compute_type})
+    elem_dict.update(
+        dict(
+            learning_rate=learning_rate,
+            num_train_epochs=num_train_epochs,
+            max_grad_norm=max_grad_norm,
+            max_samples=max_samples,
+            compute_type=compute_type,
+        )
+    )
+
+    with gr.Row():
+        cutoff_len = gr.Slider(minimum=4, maximum=131072, value=2048, step=1)
+        batch_size = gr.Slider(minimum=1, maximum=1024, value=2, step=1)
+        gradient_accumulation_steps = gr.Slider(minimum=1, maximum=1024, value=8, step=1)
+        val_size = gr.Slider(minimum=0, maximum=1, value=0, step=0.001)
+        lr_scheduler_type = gr.Dropdown(choices=[scheduler.value for scheduler in SchedulerType], value="cosine")
+
+    input_elems.update({cutoff_len, batch_size, gradient_accumulation_steps, val_size, lr_scheduler_type})
+    elem_dict.update(
+        dict(
+            cutoff_len=cutoff_len,
+            batch_size=batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
+            val_size=val_size,
+            lr_scheduler_type=lr_scheduler_type,
+        )
+    )
+
+    with gr.Accordion(open=False) as extra_tab:
+        with gr.Row():
+            logging_steps = gr.Slider(minimum=1, maximum=1000, value=5, step=5)
+            save_steps = gr.Slider(minimum=10, maximum=5000, value=100, step=10)
+            warmup_steps = gr.Slider(minimum=0, maximum=5000, value=0, step=1)
+            neftune_alpha = gr.Slider(minimum=0, maximum=10, value=0, step=0.1)
+            extra_args = gr.Textbox(value='{"optim": "adamw_torch"}')
+
+        with gr.Row():
+            with gr.Column():
+                packing = gr.Checkbox()
+                neat_packing = gr.Checkbox()
+
+            with gr.Column():
+                train_on_prompt = gr.Checkbox()
+                mask_history = gr.Checkbox()
+
+            with gr.Column():
+                resize_vocab = gr.Checkbox()
+                use_llama_pro = gr.Checkbox()
+
+            with gr.Column():
+                enable_thinking = gr.Checkbox(value=True)
+                report_to = gr.Dropdown(
+                    choices=["none", "wandb", "mlflow", "neptune", "tensorboard", "all"],
+                    value="none",
+                    allow_custom_value=True,
+                )
+
+    input_elems.update(
+        {
+            logging_steps,
+            save_steps,
+            warmup_steps,
+            neftune_alpha,
+            extra_args,
+            packing,
+            neat_packing,
+            train_on_prompt,
+            mask_history,
+            resize_vocab,
+            use_llama_pro,
+            enable_thinking,
+            report_to,
+        }
+    )
+    elem_dict.update(
+        dict(
+            extra_tab=extra_tab,
+            logging_steps=logging_steps,
+            save_steps=save_steps,
+            warmup_steps=warmup_steps,
+            neftune_alpha=neftune_alpha,
+            extra_args=extra_args,
+            packing=packing,
+            neat_packing=neat_packing,
+            train_on_prompt=train_on_prompt,
+            mask_history=mask_history,
+            resize_vocab=resize_vocab,
+            use_llama_pro=use_llama_pro,
+            enable_thinking=enable_thinking,
+            report_to=report_to,
+        )
+    )
+
+    with gr.Accordion(open=False) as freeze_tab:
+        with gr.Row():
+            freeze_trainable_layers = gr.Slider(minimum=-128, maximum=128, value=2, step=1)
+            freeze_trainable_modules = gr.Textbox(value="all")
+            freeze_extra_modules = gr.Textbox()
+
+    input_elems.update({freeze_trainable_layers, freeze_trainable_modules, freeze_extra_modules})
+    elem_dict.update(
+        dict(
+            freeze_tab=freeze_tab,
+            freeze_trainable_layers=freeze_trainable_layers,
+            freeze_trainable_modules=freeze_trainable_modules,
+            freeze_extra_modules=freeze_extra_modules,
+        )
+    )
+
+    with gr.Accordion(open=False) as lora_tab:
+        with gr.Row():
+            lora_rank = gr.Slider(minimum=1, maximum=1024, value=8, step=1)
+            lora_alpha = gr.Slider(minimum=1, maximum=2048, value=16, step=1)
+            lora_dropout = gr.Slider(minimum=0, maximum=1, value=0, step=0.01)
+            loraplus_lr_ratio = gr.Slider(minimum=0, maximum=64, value=0, step=0.01)
+            create_new_adapter = gr.Checkbox()
+
+        with gr.Row():
+            use_rslora = gr.Checkbox()
+            use_dora = gr.Checkbox()
+            use_pissa = gr.Checkbox()
+            lora_target = gr.Textbox(scale=2)
+            additional_target = gr.Textbox(scale=2)
+
+    input_elems.update(
+        {
+            lora_rank,
+            lora_alpha,
+            lora_dropout,
+            loraplus_lr_ratio,
+            create_new_adapter,
+            use_rslora,
+            use_dora,
+            use_pissa,
+            lora_target,
+            additional_target,
+        }
+    )
+    elem_dict.update(
+        dict(
+            lora_tab=lora_tab,
+            lora_rank=lora_rank,
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+            loraplus_lr_ratio=loraplus_lr_ratio,
+            create_new_adapter=create_new_adapter,
+            use_rslora=use_rslora,
+            use_dora=use_dora,
+            use_pissa=use_pissa,
+            lora_target=lora_target,
+            additional_target=additional_target,
+        )
+    )
+
+    with gr.Accordion(open=False) as rlhf_tab:
+        with gr.Row():
+            pref_beta = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.01)
+            pref_ftx = gr.Slider(minimum=0, maximum=10, value=0, step=0.01)
+            pref_loss = gr.Dropdown(choices=["sigmoid", "hinge", "ipo", "kto_pair", "orpo", "simpo"], value="sigmoid")
+            reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True)
+            with gr.Column():
+                ppo_score_norm = gr.Checkbox()
+                ppo_whiten_rewards = gr.Checkbox()
+
+    input_elems.update({pref_beta, pref_ftx, pref_loss, reward_model, ppo_score_norm, ppo_whiten_rewards})
+    elem_dict.update(
+        dict(
+            rlhf_tab=rlhf_tab,
+            pref_beta=pref_beta,
+            pref_ftx=pref_ftx,
+            pref_loss=pref_loss,
+            reward_model=reward_model,
+            ppo_score_norm=ppo_score_norm,
+            ppo_whiten_rewards=ppo_whiten_rewards,
+        )
+    )
+
+    with gr.Accordion(open=False) as mm_tab:
+        with gr.Row():
+            freeze_vision_tower = gr.Checkbox(value=True)
+            freeze_multi_modal_projector = gr.Checkbox(value=True)
+            freeze_language_model = gr.Checkbox(value=False)
+
+        with gr.Row():
+            image_max_pixels = gr.Textbox(value="768*768")
+            image_min_pixels = gr.Textbox(value="32*32")
+            video_max_pixels = gr.Textbox(value="256*256")
+            video_min_pixels = gr.Textbox(value="16*16")
+
+    input_elems.update(
+        {
+            freeze_vision_tower,
+            freeze_multi_modal_projector,
+            freeze_language_model,
+            image_max_pixels,
+            image_min_pixels,
+            video_max_pixels,
+            video_min_pixels,
+        }
+    )
+    elem_dict.update(
+        dict(
+            mm_tab=mm_tab,
+            freeze_vision_tower=freeze_vision_tower,
+            freeze_multi_modal_projector=freeze_multi_modal_projector,
+            freeze_language_model=freeze_language_model,
+            image_max_pixels=image_max_pixels,
+            image_min_pixels=image_min_pixels,
+            video_max_pixels=video_max_pixels,
+            video_min_pixels=video_min_pixels,
+        )
+    )
+
+    with gr.Accordion(open=False) as galore_tab:
+        with gr.Row():
+            use_galore = gr.Checkbox()
+            galore_rank = gr.Slider(minimum=1, maximum=1024, value=16, step=1)
+            galore_update_interval = gr.Slider(minimum=1, maximum=2048, value=200, step=1)
+            galore_scale = gr.Slider(minimum=0, maximum=100, value=2.0, step=0.1)
+            galore_target = gr.Textbox(value="all")
+
+    input_elems.update({use_galore, galore_rank, galore_update_interval, galore_scale, galore_target})
+    elem_dict.update(
+        dict(
+            galore_tab=galore_tab,
+            use_galore=use_galore,
+            galore_rank=galore_rank,
+            galore_update_interval=galore_update_interval,
+            galore_scale=galore_scale,
+            galore_target=galore_target,
+        )
+    )
+
+    with gr.Accordion(open=False) as apollo_tab:
+        with gr.Row():
+            use_apollo = gr.Checkbox()
+            apollo_rank = gr.Slider(minimum=1, maximum=1024, value=16, step=1)
+            apollo_update_interval = gr.Slider(minimum=1, maximum=2048, value=200, step=1)
+            apollo_scale = gr.Slider(minimum=0, maximum=100, value=32.0, step=0.1)
+            apollo_target = gr.Textbox(value="all")
+
+    input_elems.update({use_apollo, apollo_rank, apollo_update_interval, apollo_scale, apollo_target})
+    elem_dict.update(
+        dict(
+            apollo_tab=apollo_tab,
+            use_apollo=use_apollo,
+            apollo_rank=apollo_rank,
+            apollo_update_interval=apollo_update_interval,
+            apollo_scale=apollo_scale,
+            apollo_target=apollo_target,
+        )
+    )
+
+    with gr.Accordion(open=False) as badam_tab:
+        with gr.Row():
+            use_badam = gr.Checkbox()
+            badam_mode = gr.Dropdown(choices=["layer", "ratio"], value="layer")
+            badam_switch_mode = gr.Dropdown(choices=["ascending", "descending", "random", "fixed"], value="ascending")
+            badam_switch_interval = gr.Slider(minimum=1, maximum=1024, value=50, step=1)
+            badam_update_ratio = gr.Slider(minimum=0, maximum=1, value=0.05, step=0.01)
+
+    input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_interval, badam_update_ratio})
+    elem_dict.update(
+        dict(
+            badam_tab=badam_tab,
+            use_badam=use_badam,
+            badam_mode=badam_mode,
+            badam_switch_mode=badam_switch_mode,
+            badam_switch_interval=badam_switch_interval,
+            badam_update_ratio=badam_update_ratio,
+        )
+    )
+
+    with gr.Accordion(open=False) as swanlab_tab:
+        with gr.Row():
+            use_swanlab = gr.Checkbox()
+            swanlab_project = gr.Textbox(value="llamafactory")
+            swanlab_run_name = gr.Textbox()
+            swanlab_workspace = gr.Textbox()
+            swanlab_api_key = gr.Textbox()
+            swanlab_mode = gr.Dropdown(choices=["cloud", "local"], value="cloud")
+            swanlab_link = gr.Markdown(visible=False)
+
+    input_elems.update(
+        {
+            use_swanlab,
+            swanlab_project,
+            swanlab_run_name,
+            swanlab_workspace,
+            swanlab_api_key,
+            swanlab_mode,
+            swanlab_link,
+        }
+    )
+    elem_dict.update(
+        dict(
+            swanlab_tab=swanlab_tab,
+            use_swanlab=use_swanlab,
+            swanlab_project=swanlab_project,
+            swanlab_run_name=swanlab_run_name,
+            swanlab_workspace=swanlab_workspace,
+            swanlab_api_key=swanlab_api_key,
+            swanlab_mode=swanlab_mode,
+            swanlab_link=swanlab_link,
+        )
+    )
+
+    with gr.Row():
+        cmd_preview_btn = gr.Button()
+        arg_save_btn = gr.Button()
+        arg_load_btn = gr.Button()
+        start_btn = gr.Button(variant="primary")
+        stop_btn = gr.Button(variant="stop")
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            with gr.Row():
+                current_time = gr.Textbox(visible=False, interactive=False)
+                output_dir = gr.Dropdown(allow_custom_value=True)
+                config_path = gr.Dropdown(allow_custom_value=True)
+
+            with gr.Row():
+                device_count = gr.Textbox(value=str(get_device_count() or 1), interactive=False)
+                ds_stage = gr.Dropdown(choices=["none", "2", "3"], value="none")
+                ds_offload = gr.Checkbox()
+
+            with gr.Row():
+                resume_btn = gr.Checkbox(visible=False, interactive=False)
+                progress_bar = gr.Slider(visible=False, interactive=False)
+
+            with gr.Row():
+                output_box = gr.Markdown()
+
+        with gr.Column(scale=1):
+            loss_viewer = gr.Plot()
+
+    input_elems.update({output_dir, config_path, ds_stage, ds_offload})
+    elem_dict.update(
+        dict(
+            cmd_preview_btn=cmd_preview_btn,
+            arg_save_btn=arg_save_btn,
+            arg_load_btn=arg_load_btn,
+            start_btn=start_btn,
+            stop_btn=stop_btn,
+            current_time=current_time,
+            output_dir=output_dir,
+            config_path=config_path,
+            device_count=device_count,
+            ds_stage=ds_stage,
+            ds_offload=ds_offload,
+            resume_btn=resume_btn,
+            progress_bar=progress_bar,
+            output_box=output_box,
+            loss_viewer=loss_viewer,
+        )
+    )
+    output_elems = [output_box, progress_bar, loss_viewer, swanlab_link]
+
+    cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems, concurrency_limit=None)
+    start_btn.click(engine.runner.run_train, input_elems, output_elems)
+    stop_btn.click(engine.runner.set_abort)
+    resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
+
+    lang = engine.manager.get_elem_by_id("top.lang")
+    model_name: gr.Dropdown = engine.manager.get_elem_by_id("top.model_name")
+    finetuning_type: gr.Dropdown = engine.manager.get_elem_by_id("top.finetuning_type")
+
+    arg_save_btn.click(engine.runner.save_args, input_elems, output_elems, concurrency_limit=None)
+    arg_load_btn.click(
+        engine.runner.load_args, [lang, config_path], list(input_elems) + [output_box], concurrency_limit=None
+    )
+
+    dataset.focus(list_datasets, [dataset_dir, training_stage], [dataset], queue=False)
+    training_stage.change(change_stage, [training_stage], [dataset, packing], queue=False)
+    reward_model.focus(list_checkpoints, [model_name, finetuning_type], [reward_model], queue=False)
+    model_name.change(list_output_dirs, [model_name, finetuning_type, current_time], [output_dir], queue=False)
+    finetuning_type.change(list_output_dirs, [model_name, finetuning_type, current_time], [output_dir], queue=False)
+    output_dir.change(
+        list_output_dirs, [model_name, finetuning_type, current_time], [output_dir], concurrency_limit=None
+    )
+    output_dir.input(
+        engine.runner.check_output_dir,
+        [lang, model_name, finetuning_type, output_dir],
+        list(input_elems) + [output_box],
+        concurrency_limit=None,
+    )
+    config_path.change(list_config_paths, [current_time], [config_path], queue=False)
+
+    return elem_dict
diff --git a/LlamaFactory/src/llamafactory/webui/control.py b/LlamaFactory/src/llamafactory/webui/control.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec99f4079f84f523430d6d911a819b1d35409ba4
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/control.py
@@ -0,0 +1,224 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from typing import Any
+
+from transformers.trainer_utils import get_last_checkpoint
+
+from ..extras.constants import (
+    CHECKPOINT_NAMES,
+    PEFT_METHODS,
+    RUNNING_LOG,
+    STAGES_USE_PAIR_DATA,
+    SWANLAB_CONFIG,
+    TRAINER_LOG,
+    TRAINING_STAGES,
+)
+from ..extras.packages import is_gradio_available, is_matplotlib_available
+from ..extras.ploting import gen_loss_plot
+from ..model import QuantizationMethod
+from .common import DEFAULT_CONFIG_DIR, DEFAULT_DATA_DIR, get_model_path, get_save_dir, get_template, load_dataset_info
+from .locales import ALERTS
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+def switch_hub(hub_name: str) -> None:
+    r"""Switch model hub.
+
+    Inputs: top.hub_name
+    """
+    os.environ["USE_MODELSCOPE_HUB"] = "1" if hub_name == "modelscope" else "0"
+    os.environ["USE_OPENMIND_HUB"] = "1" if hub_name == "openmind" else "0"
+
+
+def can_quantize(finetuning_type: str) -> "gr.Dropdown":
+    r"""Judge if the quantization is available in this finetuning type.
+
+    Inputs: top.finetuning_type
+    Outputs: top.quantization_bit
+    """
+    if finetuning_type not in PEFT_METHODS:
+        return gr.Dropdown(value="none", interactive=False)
+    else:
+        return gr.Dropdown(interactive=True)
+
+
+def can_quantize_to(quantization_method: str) -> "gr.Dropdown":
+    r"""Get the available quantization bits.
+
+    Inputs: top.quantization_method
+    Outputs: top.quantization_bit
+    """
+    if quantization_method == QuantizationMethod.BNB:
+        available_bits = ["none", "8", "4"]
+    elif quantization_method == QuantizationMethod.HQQ:
+        available_bits = ["none", "8", "6", "5", "4", "3", "2", "1"]
+    elif quantization_method == QuantizationMethod.EETQ:
+        available_bits = ["none", "8"]
+
+    return gr.Dropdown(choices=available_bits)
+
+
+def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> tuple[list[str], bool]:
+    r"""Modify states after changing the training stage.
+
+    Inputs: train.training_stage
+    Outputs: train.dataset, train.packing
+    """
+    return [], TRAINING_STAGES[training_stage] == "pt"
+
+
+def get_model_info(model_name: str) -> tuple[str, str]:
+    r"""Get the necessary information of this model.
+
+    Inputs: top.model_name
+    Outputs: top.model_path, top.template
+    """
+    return get_model_path(model_name), get_template(model_name)
+
+
+def check_template(lang: str, template: str) -> None:
+    r"""Check if an instruct model is used.
+
+    Please use queue=True to show the warning message.
+
+    Inputs: top.lang, top.template
+    """
+    if template == "default":
+        gr.Warning(ALERTS["warn_no_instruct"][lang])
+
+
+def get_trainer_info(lang: str, output_path: os.PathLike, do_train: bool) -> tuple[str, "gr.Slider", dict[str, Any]]:
+    r"""Get training infomation for monitor.
+
+    If do_train is True:
+        Inputs: top.lang, train.output_path
+        Outputs: train.output_box, train.progress_bar, train.loss_viewer, train.swanlab_link
+    If do_train is False:
+        Inputs: top.lang, eval.output_path
+        Outputs: eval.output_box, eval.progress_bar, None, None
+    """
+    running_log = ""
+    running_progress = gr.Slider(visible=False)
+    running_info = {}
+
+    running_log_path = os.path.join(output_path, RUNNING_LOG)
+    if os.path.isfile(running_log_path):
+        with open(running_log_path, encoding="utf-8") as f:
+            running_log = "```\n" + f.read()[-20000:] + "\n```\n"  # avoid lengthy log
+
+    trainer_log_path = os.path.join(output_path, TRAINER_LOG)
+    if os.path.isfile(trainer_log_path):
+        trainer_log: list[dict[str, Any]] = []
+        with open(trainer_log_path, encoding="utf-8") as f:
+            for line in f:
+                trainer_log.append(json.loads(line))
+
+        if len(trainer_log) != 0:
+            latest_log = trainer_log[-1]
+            percentage = latest_log["percentage"]
+            label = "Running {:d}/{:d}: {} < {}".format(
+                latest_log["current_steps"],
+                latest_log["total_steps"],
+                latest_log["elapsed_time"],
+                latest_log["remaining_time"],
+            )
+            running_progress = gr.Slider(label=label, value=percentage, visible=True)
+
+            if do_train and is_matplotlib_available():
+                running_info["loss_viewer"] = gr.Plot(gen_loss_plot(trainer_log))
+
+    swanlab_config_path = os.path.join(output_path, SWANLAB_CONFIG)
+    if os.path.isfile(swanlab_config_path):
+        with open(swanlab_config_path, encoding="utf-8") as f:
+            swanlab_public_config = json.load(f)
+            swanlab_link = swanlab_public_config["cloud"]["experiment_url"]
+            if swanlab_link is not None:
+                running_info["swanlab_link"] = gr.Markdown(
+                    ALERTS["info_swanlab_link"][lang] + swanlab_link, visible=True
+                )
+
+    return running_log, running_progress, running_info
+
+
+def list_checkpoints(model_name: str, finetuning_type: str) -> "gr.Dropdown":
+    r"""List all available checkpoints.
+
+    Inputs: top.model_name, top.finetuning_type
+    Outputs: top.checkpoint_path
+    """
+    checkpoints = []
+    if model_name:
+        save_dir = get_save_dir(model_name, finetuning_type)
+        if save_dir and os.path.isdir(save_dir):
+            for checkpoint in os.listdir(save_dir):
+                if os.path.isdir(os.path.join(save_dir, checkpoint)) and any(
+                    os.path.isfile(os.path.join(save_dir, checkpoint, name)) for name in CHECKPOINT_NAMES
+                ):
+                    checkpoints.append(checkpoint)
+
+    if finetuning_type in PEFT_METHODS:
+        return gr.Dropdown(value=[], choices=checkpoints, multiselect=True)
+    else:
+        return gr.Dropdown(value=None, choices=checkpoints, multiselect=False)
+
+
+def list_config_paths(current_time: str) -> "gr.Dropdown":
+    r"""List all the saved configuration files.
+
+    Inputs: train.current_time
+    Outputs: train.config_path
+    """
+    config_files = [f"{current_time}.yaml"]
+    if os.path.isdir(DEFAULT_CONFIG_DIR):
+        for file_name in os.listdir(DEFAULT_CONFIG_DIR):
+            if file_name.endswith(".yaml") and file_name not in config_files:
+                config_files.append(file_name)
+
+    return gr.Dropdown(choices=config_files)
+
+
+def list_datasets(dataset_dir: str = None, training_stage: str = list(TRAINING_STAGES.keys())[0]) -> "gr.Dropdown":
+    r"""List all available datasets in the dataset dir for the training stage.
+
+    Inputs: *.dataset_dir, *.training_stage
+    Outputs: *.dataset
+    """
+    dataset_info = load_dataset_info(dataset_dir if dataset_dir is not None else DEFAULT_DATA_DIR)
+    ranking = TRAINING_STAGES[training_stage] in STAGES_USE_PAIR_DATA
+    datasets = [k for k, v in dataset_info.items() if v.get("ranking", False) == ranking]
+    return gr.Dropdown(choices=datasets)
+
+
+def list_output_dirs(model_name: str | None, finetuning_type: str, current_time: str) -> "gr.Dropdown":
+    r"""List all the directories that can resume from.
+
+    Inputs: top.model_name, top.finetuning_type, train.current_time
+    Outputs: train.output_dir
+    """
+    output_dirs = [f"train_{current_time}"]
+    if model_name:
+        save_dir = get_save_dir(model_name, finetuning_type)
+        if save_dir and os.path.isdir(save_dir):
+            for folder in os.listdir(save_dir):
+                output_dir = os.path.join(save_dir, folder)
+                if os.path.isdir(output_dir) and get_last_checkpoint(output_dir) is not None:
+                    output_dirs.append(folder)
+
+    return gr.Dropdown(choices=output_dirs)
diff --git a/LlamaFactory/src/llamafactory/webui/css.py b/LlamaFactory/src/llamafactory/webui/css.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7e4c3572907b4d2419b68f87513cc89db21ed06
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/css.py
@@ -0,0 +1,67 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Custom stylesheet for the Web UI. It styles the duplicate button, the "thinking"
+# summary/container blocks (with dark-mode overrides) and the modal box, which is
+# fixed-positioned and centered on both axes via top/left 50% plus translate(-50%, -50%).
+CSS = r"""
+.duplicate-button {
+  margin: auto !important;
+  color: white !important;
+  background: black !important;
+  border-radius: 100vh !important;
+}
+
+.thinking-summary {
+  padding: 8px !important;
+}
+
+.thinking-summary span {
+  border-radius: 4px !important;
+  padding: 4px !important;
+  cursor: pointer !important;
+  font-size: 14px !important;
+  background: rgb(245, 245, 245) !important;
+}
+
+.dark .thinking-summary span {
+  background: rgb(73, 73, 73) !important;
+}
+
+.thinking-container {
+  border-left: 2px solid #a6a6a6 !important;
+  padding-left: 10px !important;
+  margin: 4px 0 !important;
+}
+
+.thinking-container p {
+  color: #a6a6a6 !important;
+}
+
+.modal-box {
+  position: fixed !important;
+  top: 50%;
+  left: 50%;
+  transform: translate(-50%, -50%); /* center horizontally */
+  max-width: 1000px;
+  max-height: 750px;
+  overflow-y: auto;
+  background-color: var(--input-background-fill);
+  flex-wrap: nowrap !important;
+  border: 2px solid black !important;
+  z-index: 1000;
+  padding: 10px;
+}
+
+.dark .modal-box {
+  border: 2px solid white !important;
+}
+"""
diff --git a/LlamaFactory/src/llamafactory/webui/engine.py b/LlamaFactory/src/llamafactory/webui/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb1aa443d6130726a0791088f200ce12fdf80655
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/engine.py
@@ -0,0 +1,83 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Any
+
+from .chatter import WebChatModel
+from .common import create_ds_config, get_time, load_config
+from .locales import LOCALES
+from .manager import Manager
+from .runner import Runner
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+
+class Engine:
+    r"""A general engine to control the behaviors of Web UI."""
+
+    def __init__(self, demo_mode: bool = False, pure_chat: bool = False) -> None:
+        self.demo_mode = demo_mode
+        self.pure_chat = pure_chat
+        self.manager = Manager()
+        self.runner = Runner(self.manager, demo_mode)
+        self.chatter = WebChatModel(self.manager, demo_mode, lazy_init=(not pure_chat))
+        if not demo_mode:
+            create_ds_config()
+
+    def _update_component(self, input_dict: dict[str, dict[str, Any]]) -> dict["Component", "Component"]:
+        r"""Update gradio components according to the (elem_id, properties) mapping."""
+        output_dict: dict[Component, Component] = {}
+        for elem_id, elem_attr in input_dict.items():
+            elem = self.manager.get_elem_by_id(elem_id)
+            output_dict[elem] = elem.__class__(**elem_attr)
+
+        return output_dict
+
+    def resume(self):
+        r"""Get the initial value of gradio components and restores training status if necessary."""
+        user_config = load_config() if not self.demo_mode else {}  # do not use config in demo mode
+        lang = user_config.get("lang") or "en"
+        init_dict = {"top.lang": {"value": lang}, "infer.chat_box": {"visible": self.chatter.loaded}}
+
+        if not self.pure_chat:
+            current_time = get_time()
+            hub_name = user_config.get("hub_name") or "huggingface"
+            init_dict["top.hub_name"] = {"value": hub_name}
+            init_dict["train.current_time"] = {"value": current_time}
+            init_dict["train.output_dir"] = {"value": f"train_{current_time}"}
+            init_dict["train.config_path"] = {"value": f"{current_time}.yaml"}
+            init_dict["eval.output_dir"] = {"value": f"eval_{current_time}"}
+            init_dict["infer.mm_box"] = {"visible": False}
+
+            if user_config.get("last_model", None):
+                init_dict["top.model_name"] = {"value": user_config["last_model"]}
+
+        yield self._update_component(init_dict)
+
+        if self.runner.running and not self.demo_mode and not self.pure_chat:
+            yield {elem: elem.__class__(value=value) for elem, value in self.runner.running_data.items()}
+            if self.runner.do_train:
+                yield self._update_component({"train.resume_btn": {"value": True}})
+            else:
+                yield self._update_component({"eval.resume_btn": {"value": True}})
+
+    def change_lang(self, lang: str):
+        r"""Update the displayed language of gradio components."""
+        return {
+            elem: elem.__class__(**LOCALES[elem_name][lang])
+            for elem_name, elem in self.manager.get_elem_iter()
+            if elem_name in LOCALES
+        }
diff --git a/LlamaFactory/src/llamafactory/webui/interface.py b/LlamaFactory/src/llamafactory/webui/interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cb989b296ccca0c7fbace473f48ed5f3176a529
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/interface.py
@@ -0,0 +1,106 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import platform
+
+from ..extras.misc import fix_proxy, is_env_enabled
+from ..extras.packages import is_gradio_available
+from .common import save_config
+from .components import (
+    create_chat_box,
+    create_eval_tab,
+    create_export_tab,
+    create_footer,
+    create_infer_tab,
+    create_top,
+    create_train_tab,
+)
+from .css import CSS
+from .engine import Engine
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+def create_ui(demo_mode: bool = False) -> "gr.Blocks":
+    r"""Build the full Web UI with the train, evaluate, chat and (outside demo mode) export tabs.
+
+    Args:
+        demo_mode: When True, a "duplicate space" button is shown and the export tab is omitted.
+
+    Returns:
+        The assembled gradio Blocks app (not launched yet).
+    """
+    engine = Engine(demo_mode=demo_mode, pure_chat=False)
+    # host name shown in the page title: HOSTNAME, then COMPUTERNAME, then platform.node(),
+    # truncated at the first dot
+    hostname = os.getenv("HOSTNAME", os.getenv("COMPUTERNAME", platform.node())).split(".")[0]
+
+    with gr.Blocks(title=f"LLaMA Factory ({hostname})", css=CSS) as demo:
+        title = gr.HTML()
+        subtitle = gr.HTML()
+        if demo_mode:
+            gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
+
+        # components are registered in the manager under a per-section prefix
+        engine.manager.add_elems("head", {"title": title, "subtitle": subtitle})
+        engine.manager.add_elems("top", create_top())
+        lang: gr.Dropdown = engine.manager.get_elem_by_id("top.lang")
+
+        with gr.Tab("Train"):
+            engine.manager.add_elems("train", create_train_tab(engine))
+
+        with gr.Tab("Evaluate & Predict"):
+            engine.manager.add_elems("eval", create_eval_tab(engine))
+
+        with gr.Tab("Chat"):
+            engine.manager.add_elems("infer", create_infer_tab(engine))
+
+        if not demo_mode:
+            with gr.Tab("Export"):
+                engine.manager.add_elems("export", create_export_tab(engine))
+
+        engine.manager.add_elems("footer", create_footer())
+        # on page load, push the initial values (and any running job state) to all registered elements
+        demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None)
+        # changing the language re-renders the localized elements; user input also saves the choice
+        lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False)
+        lang.input(save_config, inputs=[lang], queue=False)
+
+    return demo
+
+
+def create_web_demo() -> "gr.Blocks":
+    r"""Build the chat-only Web demo: a language dropdown plus the chat box.
+
+    Returns:
+        The assembled gradio Blocks app (not launched yet).
+    """
+    engine = Engine(pure_chat=True)
+    # host name shown in the page title: HOSTNAME, then COMPUTERNAME, then platform.node(),
+    # truncated at the first dot
+    hostname = os.getenv("HOSTNAME", os.getenv("COMPUTERNAME", platform.node())).split(".")[0]
+
+    with gr.Blocks(title=f"LLaMA Factory Web Demo ({hostname})", css=CSS) as demo:
+        lang = gr.Dropdown(choices=["en", "ru", "zh", "ko", "ja"], scale=1)
+        engine.manager.add_elems("top", dict(lang=lang))
+
+        # only the third value returned by create_chat_box (the element mapping) is needed here
+        _, _, chat_elems = create_chat_box(engine, visible=True)
+        engine.manager.add_elems("infer", chat_elems)
+
+        # on page load, push the initial values to all registered elements
+        demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None)
+        # changing the language re-renders the localized elements; user input also saves the choice
+        lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False)
+        lang.input(save_config, inputs=[lang], queue=False)
+
+    return demo
+
+
+def run_web_ui() -> None:
+    gradio_ipv6 = is_env_enabled("GRADIO_IPV6")
+    gradio_share = is_env_enabled("GRADIO_SHARE")
+    server_name = os.getenv("GRADIO_SERVER_NAME", "[::]" if gradio_ipv6 else "0.0.0.0")
+    print("Visit http://ip:port for Web UI, e.g., http://127.0.0.1:7860")
+    fix_proxy(ipv6_enabled=gradio_ipv6)
+    create_ui().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
+
+
+def run_web_demo() -> None:
+    gradio_ipv6 = is_env_enabled("GRADIO_IPV6")
+    gradio_share = is_env_enabled("GRADIO_SHARE")
+    server_name = os.getenv("GRADIO_SERVER_NAME", "[::]" if gradio_ipv6 else "0.0.0.0")
+    print("Visit http://ip:port for Web UI, e.g., http://127.0.0.1:7860")
+    fix_proxy(ipv6_enabled=gradio_ipv6)
+    create_web_demo().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
diff --git a/LlamaFactory/src/llamafactory/webui/locales.py b/LlamaFactory/src/llamafactory/webui/locales.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e64e4a73cc645aeec9973caad357aa93f06e7c
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/locales.py
@@ -0,0 +1,3183 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LOCALES = {
+    "title": {
+        "en": {
+            "value": "<h1><center>🦙🏭LLaMA Factory: Unified Efficient Fine-Tuning of 100+ LLMs</center></h1>",
+        },
+        "ru": {
+            "value": "<h1><center>🦙🏭LLaMA Factory: Унифицированная эффективная тонкая настройка 100+ LLMs</center></h1>",
+        },
+        "zh": {
+            "value": "<h1><center>🦙🏭LLaMA Factory: 一站式大模型高效微调平台</center></h1>",
+        },
+        "ko": {
+            "value": "<h1><center>🦙🏭LLaMA Factory: 100+ LLMs를 위한 통합 효율적인 튜닝</center></h1>",
+        },
+        "ja": {
+            "value": "<h1><center>🦙🏭LLaMA Factory: 100+ LLMs の統合効率的なチューニング</center></h1>",
+        },
+    },
+    "subtitle": {
+        "en": {
+            "value": (
+                "<h3><center>Visit <a href='https://github.com/hiyouga/LLaMA-Factory' target='_blank'>"
+                "GitHub Page</a> <a href='https://llamafactory.readthedocs.io/en/latest/' target='_blank'>"
+                "Documentation</a> <a href='https://blog.llamafactory.net/en/' target='_blank'>"
+                "Blog</a></center></h3>"
+            ),
+        },
+        "ru": {
+            "value": (
+                "<h3><center>Посетить <a href='https://github.com/hiyouga/LLaMA-Factory' target='_blank'>"
+                "страницу GitHub</a> <a href='https://llamafactory.readthedocs.io/en/latest/' target='_blank'>"
+                "Документацию</a> <a href='https://blog.llamafactory.net/en/' target='_blank'>"
+                "Блог</a></center></h3>"
+            ),
+        },
+        "zh": {
+            "value": (
+                "<h3><center>访问 <a href='https://github.com/hiyouga/LLaMA-Factory' target='_blank'>"
+                "GitHub 主页</a> <a href='https://llamafactory.readthedocs.io/zh-cn/latest/' target='_blank'>"
+                "官方文档</a> <a href='https://blog.llamafactory.net/' target='_blank'>"
+                "博客</a></center></h3>"
+            ),
+        },
+        "ko": {
+            "value": (
+                "<h3><center><a href='https://github.com/hiyouga/LLaMA-Factory' target='_blank'>"
+                "GitHub 페이지</a> <a href='https://llamafactory.readthedocs.io/en/latest/' target='_blank'>"
+                "공식 문서</a> <a href='https://blog.llamafactory.net/en/' target='_blank'>"
+                "블로그</a>를 방문하세요.</center></h3>"
+            ),
+        },
+        "ja": {
+            "value": (
+                "<h3><center><a href='https://github.com/hiyouga/LLaMA-Factory' target='_blank'>"
+                "GitHub ページ</a> <a href='https://llamafactory.readthedocs.io/en/latest/' target='_blank'>"
+                "ドキュメント</a> <a href='https://blog.llamafactory.net/en/' target='_blank'>"
+                "ブログ</a>にアクセスする</center></h3>"
+            ),
+        },
+    },
+    "lang": {
+        "en": {
+            "label": "Language",
+        },
+        "ru": {
+            "label": "Язык",
+        },
+        "zh": {
+            "label": "语言",
+        },
+        "ko": {
+            "label": "언어",
+        },
+        "ja": {
+            "label": "言語",
+        },
+    },
+    "model_name": {
+        "en": {
+            "label": "Model name",
+            "info": "Input the initial name to search for the model.",
+        },
+        "ru": {
+            "label": "Название модели",
+            "info": "Введите начальное имя для поиска модели.",
+        },
+        "zh": {
+            "label": "模型名称",
+            "info": "输入首单词以检索模型。",
+        },
+        "ko": {
+            "label": "모델 이름",
+            "info": "모델을 검색할 초기 이름을 입력하세요.",
+        },
+        "ja": {
+            "label": "モデル名",
+            "info": "モデルを検索するための初期名を入力してください。",
+        },
+    },
+    "model_path": {
+        "en": {
+            "label": "Model path",
+            "info": "Path to pretrained model or model identifier from Hugging Face.",
+        },
+        "ru": {
+            "label": "Путь к модели",
+            "info": "Путь к предварительно обученной модели или идентификатор модели от Hugging Face.",
+        },
+        "zh": {
+            "label": "模型路径",
+            "info": "本地模型的文件路径或 Hugging Face 的模型标识符。",
+        },
+        "ko": {
+            "label": "모델 경로",
+            "info": "사전 훈련된 모델의 경로 또는 Hugging Face의 모델 식별자.",
+        },
+        "ja": {
+            "label": "モデルパス",
+            "info": "事前学習済みモデルへのパス、または Hugging Face のモデル識別子。",
+        },
+    },
+    "hub_name": {
+        "en": {
+            "label": "Hub name",
+            "info": "Choose the model download source.",
+        },
+        "ru": {
+            "label": "Имя хаба",
+            "info": "Выберите источник загрузки модели.",
+        },
+        "zh": {
+            "label": "模型下载源",
+            "info": "选择模型下载源。（网络受限环境推荐使用 ModelScope）",
+        },
+        "ko": {
+            "label": "모델 다운로드 소스",
+            "info": "모델 다운로드 소스를 선택하세요.",
+        },
+        "ja": {
+            "label": "モデルダウンロードソース",
+            "info": "モデルをダウンロードするためのソースを選択してください。",
+        },
+    },
+    "finetuning_type": {
+        "en": {
+            "label": "Finetuning method",
+        },
+        "ru": {
+            "label": "Метод дообучения",
+        },
+        "zh": {
+            "label": "微调方法",
+        },
+        "ko": {
+            "label": "파인튜닝 방법",
+        },
+        "ja": {
+            "label": "ファインチューニング方法",
+        },
+    },
+    "checkpoint_path": {
+        "en": {
+            "label": "Checkpoint path",
+        },
+        "ru": {
+            "label": "Путь контрольной точки",
+        },
+        "zh": {
+            "label": "检查点路径",
+        },
+        "ko": {
+            "label": "체크포인트 경로",
+        },
+        "ja": {
+            "label": "チェックポイントパス",
+        },
+    },
+    "quantization_bit": {
+        "en": {
+            "label": "Quantization bit",
+            "info": "Enable quantization (QLoRA).",
+        },
+        "ru": {
+            "label": "Уровень квантования",
+            "info": "Включить квантование (QLoRA).",
+        },
+        "zh": {
+            "label": "量化等级",
+            "info": "启用量化（QLoRA）。",
+        },
+        "ko": {
+            "label": "양자화 비트",
+            "info": "양자화 활성화 (QLoRA).",
+        },
+        "ja": {
+            "label": "量子化ビット",
+            "info": "量子化を有効にする (QLoRA)。",
+        },
+    },
+    "quantization_method": {
+        "en": {
+            "label": "Quantization method",
+            "info": "Quantization algorithm to use.",
+        },
+        "ru": {
+            "label": "Метод квантования",
+            "info": "Алгоритм квантования, который следует использовать.",
+        },
+        "zh": {
+            "label": "量化方法",
+            "info": "使用的量化算法。",
+        },
+        "ko": {
+            "label": "양자화 방법",
+            "info": "사용할 양자화 알고리즘.",
+        },
+        "ja": {
+            "label": "量子化方法",
+            "info": "使用する量子化アルゴリズム。",
+        },
+    },
+    "template": {
+        "en": {
+            "label": "Chat template",
+            "info": "The chat template used in constructing prompts.",
+        },
+        "ru": {
+            "label": "Шаблон чата",
+            "info": "Шаблон чата используемый для составления подсказок.",
+        },
+        "zh": {
+            "label": "对话模板",
+            "info": "构建提示词时使用的模板。",
+        },
+        "ko": {
+            "label": "채팅 템플릿",
+            "info": "프롬프트 작성에 사용되는 채팅 템플릿.",
+        },
+        "ja": {
+            "label": "チャットテンプレート",
+            "info": "プロンプトの構築に使用されるチャットテンプレート。",
+        },
+    },
+    "rope_scaling": {
+        "en": {
+            "label": "RoPE scaling",
+            "info": "RoPE scaling method to use.",
+        },
+        "ru": {
+            "label": "Масштабирование RoPE",
+            "info": "Метод масштабирования RoPE для использования.",
+        },
+        "zh": {"label": "RoPE 插值方法", "info": "RoPE 插值时使用的方法。"},
+        "ko": {
+            "label": "RoPE 스케일링",
+            "info": "사용할 RoPE 스케일링 방법.",
+        },
+        "ja": {
+            "label": "RoPE スケーリング",
+            "info": "使用する RoPE スケーリング方法。",
+        },
+    },
+    "booster": {
+        "en": {
+            "label": "Booster",
+            "info": "Approach used to boost training speed.",
+        },
+        "ru": {
+            "label": "Ускоритель",
+            "info": "Подход, используемый для ускорения обучения.",
+        },
+        "zh": {"label": "加速方式", "info": "使用的加速方法。"},
+        "ko": {
+            "label": "부스터",
+            "info": "훈련 속도를 향상시키기 위해 사용된 접근 방식.",
+        },
+        "ja": {
+            "label": "ブースター",
+            "info": "トレーニング速度を向上させるためのアプローチ。",
+        },
+    },
+    "training_stage": {
+        "en": {
+            "label": "Stage",
+            "info": "The stage to perform in training.",
+        },
+        "ru": {
+            "label": "Этап",
+            "info": "Этап выполнения обучения.",
+        },
+        "zh": {
+            "label": "训练阶段",
+            "info": "目前采用的训练方式。",
+        },
+        "ko": {
+            "label": "학습 단계",
+            "info": "수행할 학습 방법.",
+        },
+        "ja": {
+            "label": "ステージ",
+            "info": "トレーニングで実行するステージ。",
+        },
+    },
+    "dataset_dir": {
+        "en": {
+            "label": "Data dir",
+            "info": "Path to the data directory.",
+        },
+        "ru": {
+            "label": "Директория данных",
+            "info": "Путь к директории данных.",
+        },
+        "zh": {
+            "label": "数据路径",
+            "info": "数据文件夹的路径。",
+        },
+        "ko": {
+            "label": "데이터 디렉토리",
+            "info": "데이터 디렉토리의 경로.",
+        },
+        "ja": {
+            "label": "データディレクトリ",
+            "info": "データディレクトリへのパス。",
+        },
+    },
+    "dataset": {
+        "en": {
+            "label": "Dataset",
+        },
+        "ru": {
+            "label": "Набор данных",
+        },
+        "zh": {
+            "label": "数据集",
+        },
+        "ko": {
+            "label": "데이터셋",
+        },
+        "ja": {
+            "label": "データセット",
+        },
+    },
+    "data_preview_btn": {
+        "en": {
+            "value": "Preview dataset",
+        },
+        "ru": {
+            "value": "Просмотреть набор данных",
+        },
+        "zh": {
+            "value": "预览数据集",
+        },
+        "ko": {
+            "value": "데이터셋 미리보기",
+        },
+        "ja": {
+            "value": "データセットをプレビュー",
+        },
+    },
+    "preview_count": {
+        "en": {
+            "label": "Count",
+        },
+        "ru": {
+            "label": "Количество",
+        },
+        "zh": {
+            "label": "数量",
+        },
+        "ko": {
+            "label": "개수",
+        },
+        "ja": {
+            "label": "カウント",
+        },
+    },
+    "page_index": {
+        "en": {
+            "label": "Page",
+        },
+        "ru": {
+            "label": "Страница",
+        },
+        "zh": {
+            "label": "页数",
+        },
+        "ko": {
+            "label": "페이지",
+        },
+        "ja": {
+            "label": "ページ",
+        },
+    },
+    "prev_btn": {
+        "en": {
+            "value": "Prev",
+        },
+        "ru": {
+            "value": "Предыдущая",
+        },
+        "zh": {
+            "value": "上一页",
+        },
+        "ko": {
+            "value": "이전",
+        },
+        "ja": {
+            "value": "前へ",
+        },
+    },
+    "next_btn": {
+        "en": {
+            "value": "Next",
+        },
+        "ru": {
+            "value": "Следующая",
+        },
+        "zh": {
+            "value": "下一页",
+        },
+        "ko": {
+            "value": "다음",
+        },
+        "ja": {
+            "value": "次へ",
+        },
+    },
+    "close_btn": {
+        "en": {
+            "value": "Close",
+        },
+        "ru": {
+            "value": "Закрыть",
+        },
+        "zh": {
+            "value": "关闭",
+        },
+        "ko": {
+            "value": "닫기",
+        },
+        "ja": {
+            "value": "閉じる",
+        },
+    },
+    "preview_samples": {
+        "en": {
+            "label": "Samples",
+        },
+        "ru": {
+            "label": "Примеры",
+        },
+        "zh": {
+            "label": "样例",
+        },
+        "ko": {
+            "label": "샘플",
+        },
+        "ja": {
+            "label": "サンプル",
+        },
+    },
+    "learning_rate": {
+        "en": {
+            "label": "Learning rate",
+            "info": "Initial learning rate for AdamW.",
+        },
+        "ru": {
+            "label": "Скорость обучения",
+            "info": "Начальная скорость обучения для AdamW.",
+        },
+        "zh": {
+            "label": "学习率",
+            "info": "AdamW 优化器的初始学习率。",
+        },
+        "ko": {
+            "label": "학습률",
+            "info": "AdamW의 초기 학습률.",
+        },
+        "ja": {
+            "label": "学習率",
+            "info": "AdamW の初期学習率。",
+        },
+    },
+    "num_train_epochs": {
+        "en": {
+            "label": "Epochs",
+            "info": "Total number of training epochs to perform.",
+        },
+        "ru": {
+            "label": "Эпохи",
+            "info": "Общее количество эпох обучения.",
+        },
+        "zh": {
+            "label": "训练轮数",
+            "info": "需要执行的训练总轮数。",
+        },
+        "ko": {
+            "label": "에포크",
+            "info": "수행할 총 학습 에포크 수.",
+        },
+        "ja": {
+            "label": "エポック数",
+            "info": "実行するトレーニングの総エポック数。",
+        },
+    },
+    "max_grad_norm": {
+        "en": {
+            "label": "Maximum gradient norm",
+            "info": "Norm for gradient clipping.",
+        },
+        "ru": {
+            "label": "Максимальная норма градиента",
+            "info": "Норма для обрезки градиента.",
+        },
+        "zh": {
+            "label": "最大梯度范数",
+            "info": "用于梯度裁剪的范数。",
+        },
+        "ko": {
+            "label": "최대 그레디언트 노름(norm)",
+            "info": "그레디언트 클리핑을 위한 노름(norm).",
+        },
+        "ja": {
+            "label": "最大勾配ノルム",
+            "info": "勾配クリッピングのためのノルム。",
+        },
+    },
+    "max_samples": {
+        "en": {
+            "label": "Max samples",
+            "info": "Maximum samples per dataset.",
+        },
+        "ru": {
+            "label": "Максимальное количество образцов",
+            "info": "Максимальное количество образцов на набор данных.",
+        },
+        "zh": {
+            "label": "最大样本数",
+            "info": "每个数据集的最大样本数。",
+        },
+        "ko": {
+            "label": "최대 샘플 수",
+            "info": "데이터셋 당 최대 샘플 수.",
+        },
+        "ja": {
+            "label": "最大サンプル数",
+            "info": "データセットごとの最大サンプル数。",
+        },
+    },
+    "compute_type": {
+        "en": {
+            "label": "Compute type",
+            "info": "Whether to use mixed precision training.",
+        },
+        "ru": {
+            "label": "Тип вычислений",
+            "info": "Использовать ли обучение смешанной точности.",
+        },
+        "zh": {
+            "label": "计算类型",
+            "info": "是否使用混合精度训练。",
+        },
+        "ko": {
+            "label": "연산 유형",
+            "info": "혼합 정밀도 훈련을 사용할지 여부.",
+        },
+        "ja": {
+            "label": "計算タイプ",
+            "info": "混合精度トレーニングを使用するかどうか。",
+        },
+    },
+    "cutoff_len": {
+        "en": {
+            "label": "Cutoff length",
+            "info": "Max tokens in input sequence.",
+        },
+        "ru": {
+            "label": "Длина обрезки",
+            "info": "Максимальное количество токенов во входной последовательности.",
+        },
+        "zh": {
+            "label": "截断长度",
+            "info": "输入序列分词后的最大长度。",
+        },
+        "ko": {
+            "label": "컷오프 길이",
+            "info": "입력 시퀀스의 최대 토큰 수.",
+        },
+        "ja": {
+            "label": "カットオフ長",
+            "info": "入力シーケンスの最大トークン数。",
+        },
+    },
+    "batch_size": {
+        "en": {
+            "label": "Batch size",
+            "info": "Number of samples processed on each GPU.",
+        },
+        "ru": {
+            "label": "Размер пакета",
+            "info": "Количество образцов для обработки на каждом GPU.",
+        },
+        "zh": {
+            "label": "批处理大小",
+            "info": "每个 GPU 处理的样本数量。",
+        },
+        "ko": {
+            "label": "배치 크기",
+            "info": "각 GPU에서 처리되는 샘플 수.",
+        },
+        "ja": {
+            "label": "バッチサイズ",
+            "info": "各 GPU で処理されるサンプル数。",
+        },
+    },
+    "gradient_accumulation_steps": {
+        "en": {
+            "label": "Gradient accumulation",
+            "info": "Number of steps for gradient accumulation.",
+        },
+        "ru": {
+            "label": "Накопление градиента",
+            "info": "Количество шагов накопления градиента.",
+        },
+        "zh": {
+            "label": "梯度累积",
+            "info": "梯度累积的步数。",
+        },
+        "ko": {
+            "label": "그레디언트 누적",
+            "info": "그레디언트 누적 단계 수.",
+        },
+        "ja": {
+            "label": "勾配累積",
+            "info": "勾配累積のステップ数。",
+        },
+    },
+    "val_size": {
+        "en": {
+            "label": "Val size",
+            "info": "Percentage of validation set from the entire dataset.",
+        },
+        "ru": {
+            "label": "Размер валидации",
+            "info": "Пропорция данных в наборе для разработки.",
+        },
+        "zh": {
+            "label": "验证集比例",
+            "info": "验证集占全部样本的百分比。",
+        },
+        "ko": {
+            "label": "검증 데이터셋 크기",
+            "info": "개발 데이터셋에서 검증 데이터의 비율.",
+        },
+        "ja": {
+            "label": "検証セットサイズ",
+            "info": "データセット全体に対する検証セットの割合。",
+        },
+    },
+    "lr_scheduler_type": {
+        "en": {
+            "label": "LR scheduler",
+            "info": "Name of the learning rate scheduler.",
+        },
+        "ru": {
+            "label": "Планировщик скорости обучения",
+            "info": "Название планировщика скорости обучения.",
+        },
+        "zh": {
+            "label": "学习率调节器",
+            "info": "学习率调度器的名称。",
+        },
+        "ko": {
+            "label": "LR 스케줄러",
+            "info": "학습률 스케줄러의 이름.",
+        },
+        "ja": {
+            "label": "学習率スケジューラ",
+            "info": "学習率スケジューラの名前。",
+        },
+    },
+    "extra_tab": {
+        "en": {
+            "label": "Extra configurations",
+        },
+        "ru": {
+            "label": "Дополнительные конфигурации",
+        },
+        "zh": {
+            "label": "其它参数设置",
+        },
+        "ko": {
+            "label": "추가 구성(configuration)",
+        },
+        "ja": {
+            "label": "追加設定",
+        },
+    },
+    "logging_steps": {
+        "en": {
+            "label": "Logging steps",
+            "info": "Number of steps between two logs.",
+        },
+        "ru": {
+            "label": "Шаги логирования",
+            "info": "Количество шагов между двумя записями в журнале.",
+        },
+        "zh": {
+            "label": "日志间隔",
+            "info": "每两次日志输出间的更新步数。",
+        },
+        "ko": {
+            "label": "로깅 스텝",
+            "info": "이전 로깅과 다음 로깅 간 스텝 수.",
+        },
+        "ja": {
+            "label": "ロギングステップ",
+            "info": "2 つのログ間のステップ数。",
+        },
+    },
+    "save_steps": {
+        "en": {
+            "label": "Save steps",
+            "info": "Number of steps between two checkpoints.",
+        },
+        "ru": {
+            "label": "Шаги сохранения",
+            "info": "Количество шагов между двумя контрольными точками.",
+        },
+        "zh": {
+            "label": "保存间隔",
+            "info": "每两次断点保存间的更新步数。",
+        },
+        "ko": {
+            "label": "저장 스텝",
+            "info": "이전 체크포인트와 다음 체크포인트 사이의 스텝 수.",
+        },
+        "ja": {
+            "label": "保存ステップ",
+            "info": "2 つのチェックポイント間のステップ数。",
+        },
+    },
+    "warmup_steps": {
+        "en": {
+            "label": "Warmup steps",
+            "info": "Number of steps used for warmup.",
+        },
+        "ru": {
+            "label": "Шаги прогрева",
+            "info": "Количество шагов, используемых для прогрева.",
+        },
+        "zh": {
+            "label": "预热步数",
+            "info": "学习率预热采用的步数。",
+        },
+        "ko": {
+            "label": "Warmup 스텝",
+            "info": "Warmup에 사용되는 스텝 수.",
+        },
+        "ja": {
+            "label": "ウォームアップステップ",
+            "info": "ウォームアップに使用されるステップ数。",
+        },
+    },
+    "neftune_alpha": {
+        "en": {
+            "label": "NEFTune alpha",
+            "info": "Magnitude of noise adding to embedding vectors.",
+        },
+        "ru": {
+            "label": "NEFTune alpha",
+            "info": "Величина шума, добавляемого к векторам вложений.",
+        },
+        "zh": {
+            "label": "NEFTune 噪声参数",
+            "info": "嵌入向量所添加的噪声大小。",
+        },
+        "ko": {
+            "label": "NEFTune 알파",
+            "info": "임베딩 벡터에 추가되는 노이즈의 크기.",
+        },
+        "ja": {
+            "label": "NEFTune alpha",
+            "info": "埋め込みベクトルに追加されるノイズの大きさ。",
+        },
+    },
+    "extra_args": {
+        "en": {
+            "label": "Extra arguments",
+            "info": "Extra arguments passed to the trainer in JSON format.",
+        },
+        "ru": {
+            "label": "Дополнительные аргументы",
+            "info": "Дополнительные аргументы, которые передаются тренеру в формате JSON.",
+        },
+        "zh": {
+            "label": "额外参数",
+            "info": "以 JSON 格式传递给训练器的额外参数。",
+        },
+        "ko": {
+            "label": "추가 인수",
+            "info": "JSON 형식으로 트레이너에게 전달할 추가 인수입니다.",
+        },
+        "ja": {
+            "label": "追加引数",
+            "info": "JSON 形式でトレーナーに渡される追加引数。",
+        },
+    },
+    "packing": {
+        "en": {
+            "label": "Pack sequences",
+            "info": "Pack sequences into samples of fixed length.",
+        },
+        "ru": {
+            "label": "Упаковка последовательностей",
+            "info": "Упаковка последовательностей в образцы фиксированной длины.",
+        },
+        "zh": {
+            "label": "序列打包",
+            "info": "将序列打包为等长样本。",
+        },
+        "ko": {
+            "label": "시퀀스 패킹",
+            "info": "고정된 길이의 샘플로 시퀀스를 패킹합니다.",
+        },
+        "ja": {
+            "label": "シーケンスパッキング",
+            "info": "シーケンスを固定長のサンプルにパッキングします。",
+        },
+    },
+    "neat_packing": {
+        "en": {
+            "label": "Use neat packing",
+            "info": "Avoid cross-attention between packed sequences.",
+        },
+        "ru": {
+            "label": "Используйте аккуратную упаковку",
+            "info": "избегайте перекрестного внимания между упакованными последовательностями.",
+        },
+        "zh": {
+            "label": "使用无污染打包",
+            "info": "避免打包后的序列产生交叉注意力。",
+        },
+        "ko": {
+            "label": "니트 패킹 사용",
+            "info": "패킹된 시퀀스 간의 크로스 어텐션을 피합니다.",
+        },
+        "ja": {
+            "label": "無汚染パッキングを使用",
+            "info": "パッキング後のシーケンス間のクロスアテンションを避けます。",
+        },
+    },
+    "train_on_prompt": {
+        "en": {
+            "label": "Train on prompt",
+            "info": "Disable the label mask on the prompt (only for SFT).",
+        },
+        "ru": {
+            "label": "Тренировка на подсказке",
+            "info": "Отключить маску меток на подсказке (только для SFT).",
+        },
+        "zh": {
+            "label": "学习提示词",
+            "info": "不在提示词的部分添加掩码（仅适用于 SFT）。",
+        },
+        "ko": {
+            "label": "프롬프트도 학습",
+            "info": "프롬프트에서 라벨 마스킹을 비활성화합니다 (SFT에만 해당).",
+        },
+        "ja": {
+            "label": "プロンプトで学習",
+            "info": "プロンプト部分にマスクを追加しない（SFT のみ）。",
+        },
+    },
+    "mask_history": {
+        "en": {
+            "label": "Mask history",
+            "info": "Train on the last turn only (only for SFT).",
+        },
+        "ru": {
+            "label": "История масок",
+            "info": "Тренироваться только на последнем шаге (только для SFT).",
+        },
+        "zh": {
+            "label": "不学习历史对话",
+            "info": "仅学习最后一轮对话（仅适用于 SFT）。",
+        },
+        "ko": {
+            "label": "히스토리 마스킹",
+            "info": "대화 데이터의 마지막 턴만 학습합니다 (SFT에만 해당).",
+        },
+        "ja": {
+            "label": "履歴をマスク",
+            "info": "最後のターンのみを学習する（SFT のみ）。",
+        },
+    },
+    "resize_vocab": {
+        "en": {
+            "label": "Resize token embeddings",
+            "info": "Resize the tokenizer vocab and the embedding layers.",
+        },
+        "ru": {
+            "label": "Изменение размера токенных эмбеддингов",
+            "info": "Изменить размер словаря токенизатора и слоев эмбеддинга.",
+        },
+        "zh": {
+            "label": "更改词表大小",
+            "info": "更改分词器词表和嵌入层的大小。",
+        },
+        "ko": {
+            "label": "토큰 임베딩의 사이즈 조정",
+            "info": "토크나이저 어휘와 임베딩 레이어의 크기를 조정합니다.",
+        },
+        "ja": {
+            "label": "トークン埋め込みのサイズ変更",
+            "info": "トークナイザーの語彙と埋め込み層のサイズを変更します。",
+        },
+    },
+    "use_llama_pro": {
+        "en": {
+            "label": "Enable LLaMA Pro",
+            "info": "Make the parameters in the expanded blocks trainable.",
+        },
+        "ru": {
+            "label": "Включить LLaMA Pro",
+            "info": "Сделать параметры в расширенных блоках обучаемыми.",
+        },
+        "zh": {
+            "label": "使用 LLaMA Pro",
+            "info": "仅训练块扩展后的参数。",
+        },
+        "ko": {
+            "label": "LLaMA Pro 사용",
+            "info": "확장된 블록의 매개변수를 학습 가능하게 만듭니다.",
+        },
+        "ja": {
+            "label": "LLaMA Pro を有効化",
+            "info": "拡張ブロックのパラメータのみをトレーニングします。",
+        },
+    },
+    "enable_thinking": {
+        "en": {
+            "label": "Enable thinking",
+            "info": "Whether or not to enable thinking mode for reasoning models.",
+        },
+        "ru": {
+            "label": "Включить мысли",
+            "info": "Включить режим мысли для моделей решающего характера.",
+        },
+        "zh": {
+            "label": "启用思考模式",
+            "info": "是否启用推理模型的思考模式。",
+        },
+        "ko": {
+            "label": "생각 모드 활성화",
+            "info": "추론 모델의 생각 모드를 활성화할지 여부.",
+        },
+        "ja": {
+            "label": "思考モードを有効化",
+            "info": "推論モデルの思考モードを有効にするかどうか。",
+        },
+    },
+    "report_to": {
+        "en": {
+            "label": "Enable external logger",
+            "info": "Use TensorBoard or wandb to log experiment.",
+        },
+        "ru": {
+            "label": "Включить внешний регистратор",
+            "info": "Использовать TensorBoard или wandb для ведения журнала экспериментов.",
+        },
+        "zh": {
+            "label": "启用外部记录面板",
+            "info": "使用 TensorBoard 或 wandb 记录实验。",
+        },
+        "ko": {
+            "label": "외부 logger 활성화",
+            "info": "TensorBoard 또는 wandb를 사용하여 실험을 기록합니다.",
+        },
+        "ja": {
+            "label": "外部ロガーを有効化",
+            "info": "TensorBoard または wandb を使用して実験を記録します。",
+        },
+    },
+    "freeze_tab": {
+        "en": {
+            "label": "Freeze tuning configurations",
+        },
+        "ru": {
+            "label": "конфигурации для настройки заморозки",
+        },
+        "zh": {
+            "label": "部分参数微调设置",
+        },
+        "ko": {
+            "label": "Freeze tuning 설정",
+        },
+        "ja": {
+            "label": "フリーズチューニング設定",
+        },
+    },
+    "freeze_trainable_layers": {
+        "en": {
+            "label": "Trainable layers",
+            "info": "Number of the last(+)/first(-) hidden layers to be set as trainable.",
+        },
+        "ru": {
+            "label": "Обучаемые слои",
+            "info": "Количество последних (+)/первых (-) скрытых слоев, которые будут установлены как обучаемые.",
+        },
+        "zh": {
+            "label": "可训练层数",
+            "info": "最末尾（+）/最前端（-）可训练隐藏层的数量。",
+        },
+        "ko": {
+            "label": "학습 가능한 레이어",
+            "info": "학습 가능하게 설정할 마지막(+)/처음(-) 히든 레이어의 수.",
+        },
+        "ja": {
+            "label": "学習可能なレイヤー",
+            "info": "最後（+）/最初（-）の学習可能な隠れ層の数。",
+        },
+    },
+    "freeze_trainable_modules": {
+        "en": {
+            "label": "Trainable modules",
+            "info": "Name(s) of trainable modules. Use commas to separate multiple modules.",
+        },
+        "ru": {
+            "label": "Обучаемые модули",
+            "info": "Название обучаемых модулей. Используйте запятые для разделения нескольких модулей.",
+        },
+        "zh": {
+            "label": "可训练模块",
+            "info": "可训练模块的名称。使用英文逗号分隔多个名称。",
+        },
+        "ko": {
+            "label": "학습 가능한 모듈",
+            "info": "학습 가능한 모듈의 이름. 여러 모듈을 구분하려면 쉼표(,)를 사용하세요.",
+        },
+        "ja": {
+            "label": "学習可能なモジュール",
+            "info": "学習可能なモジュールの名前。複数のモジュールを区切るにはカンマを使用します。",
+        },
+    },
+    "freeze_extra_modules": {
+        "en": {
+            "label": "Extra modules (optional)",
+            "info": (
+                "Name(s) of modules apart from hidden layers to be set as trainable. "
+                "Use commas to separate multiple modules."
+            ),
+        },
+        "ru": {
+            "label": "Дополнительные модули (опционально)",
+            "info": (
+                "Имена модулей, кроме скрытых слоев, которые следует установить в качестве обучаемых. "
+                "Используйте запятые для разделения нескольких модулей."
+            ),
+        },
+        "zh": {
+            "label": "额外模块（非必填）",
+            "info": "除隐藏层以外的可训练模块名称。使用英文逗号分隔多个名称。",
+        },
+        "ko": {
+            "label": "추가 모듈 (선택 사항)",
+            "info": "히든 레이어 외에 학습 가능하게 설정할 모듈의 이름. 모듈 간에는 쉼표(,)로 구분하십시오.",
+        },
+        "ja": {
+            "label": "追加モジュール（オプション）",
+            "info": "隠れ層以外の学習可能なモジュールの名前。複数のモジュールを区切るにはカンマを使用します。",
+        },
+    },
+    "lora_tab": {
+        "en": {
+            "label": "LoRA configurations",
+        },
+        "ru": {
+            "label": "Конфигурации LoRA",
+        },
+        "zh": {
+            "label": "LoRA 参数设置",
+        },
+        "ko": {
+            "label": "LoRA 구성",
+        },
+        "ja": {
+            "label": "LoRA 設定",
+        },
+    },
+    "lora_rank": {
+        "en": {
+            "label": "LoRA rank",
+            "info": "The rank of LoRA matrices.",
+        },
+        "ru": {
+            "label": "Ранг матриц LoRA",
+            "info": "Ранг матриц LoRA.",
+        },
+        "zh": {
+            "label": "LoRA 秩",
+            "info": "LoRA 矩阵的秩大小。",
+        },
+        "ko": {
+            "label": "LoRA 랭크",
+            "info": "LoRA 행렬의 랭크.",
+        },
+        "ja": {
+            "label": "LoRA ランク",
+            "info": "LoRA 行列のランク。",
+        },
+    },
+    "lora_alpha": {
+        "en": {
+            "label": "LoRA alpha",
+            "info": "LoRA scaling coefficient.",
+        },
+        "ru": {
+            "label": "LoRA alpha",
+            "info": "Коэффициент масштабирования LoRA.",
+        },
+        "zh": {
+            "label": "LoRA 缩放系数",
+            "info": "LoRA 缩放系数大小。",
+        },
+        "ko": {
+            "label": "LoRA 알파",
+            "info": "LoRA 스케일링 계수.",
+        },
+        "ja": {
+            "label": "LoRA alpha",
+            "info": "LoRA スケーリング係数。",
+        },
+    },
+    "lora_dropout": {
+        "en": {
+            "label": "LoRA dropout",
+            "info": "Dropout ratio of LoRA weights.",
+        },
+        "ru": {
+            "label": "Вероятность отсева LoRA",
+            "info": "Вероятность отсева весов LoRA.",
+        },
+        "zh": {
+            "label": "LoRA 随机丢弃",
+            "info": "LoRA 权重随机丢弃的概率。",
+        },
+        "ko": {
+            "label": "LoRA 드롭아웃",
+            "info": "LoRA 가중치의 드롭아웃 비율.",
+        },
+        "ja": {
+            "label": "LoRA ドロップアウト",
+            "info": "LoRA 重みのドロップアウト確率。",
+        },
+    },
+    "loraplus_lr_ratio": {
+        "en": {
+            "label": "LoRA+ LR ratio",
+            "info": "The LR ratio of the B matrices in LoRA.",
+        },
+        "ru": {
+            "label": "LoRA+ LR коэффициент",
+            "info": "Коэффициент LR матриц B в LoRA.",
+        },
+        "zh": {
+            "label": "LoRA+ 学习率比例",
+            "info": "LoRA+ 中 B 矩阵的学习率倍数。",
+        },
+        "ko": {
+            "label": "LoRA+ LR 비율",
+            "info": "LoRA에서 B 행렬의 LR 비율.",
+        },
+        "ja": {
+            "label": "LoRA+ LR 比率",
+            "info": "LoRA+ の B 行列の学習率倍率。",
+        },
+    },
+    "create_new_adapter": {
+        "en": {
+            "label": "Create new adapter",
+            "info": "Create a new adapter with randomly initialized weight upon the existing one.",
+        },
+        "ru": {
+            "label": "Создать новый адаптер",
+            "info": "Создать новый адаптер с случайной инициализацией веса на основе существующего.",
+        },
+        "zh": {
+            "label": "新建适配器",
+            "info": "在现有的适配器上创建一个随机初始化后的新适配器。",
+        },
+        "ko": {
+            "label": "새 어댑터 생성",
+            "info": "기존 어댑터 위에 무작위로 초기화된 가중치를 가진 새 어댑터를 생성합니다.",
+        },
+        "ja": {
+            "label": "新しいアダプターを作成",
+            "info": "既存のアダプター上にランダムに初期化された新しいアダプターを作成します。",
+        },
+    },
+    "use_rslora": {
+        "en": {
+            "label": "Use rslora",
+            "info": "Use the rank stabilization scaling factor for LoRA layer.",
+        },
+        "ru": {
+            "label": "Использовать rslora",
+            "info": "Использовать коэффициент масштабирования стабилизации ранга для слоя LoRA.",
+        },
+        "zh": {
+            "label": "使用 rslora",
+            "info": "对 LoRA 层使用秩稳定缩放方法。",
+        },
+        "ko": {
+            "label": "rslora 사용",
+            "info": "LoRA 레이어에 랭크 안정화 스케일링 계수를 사용합니다.",
+        },
+        "ja": {
+            "label": "rslora を使用",
+            "info": "LoRA 層にランク安定化スケーリング方法を使用します。",
+        },
+    },
+    "use_dora": {
+        "en": {
+            "label": "Use DoRA",
+            "info": "Use weight-decomposed LoRA.",
+        },
+        "ru": {
+            "label": "Используйте DoRA",
+            "info": "Используйте LoRA с декомпозицией весов.",
+        },
+        "zh": {
+            "label": "使用 DoRA",
+            "info": "使用权重分解的 LoRA。",
+        },
+        "ko": {
+            "label": "DoRA 사용",
+            "info": "가중치-분해 LoRA를 사용합니다.",
+        },
+        "ja": {
+            "label": "DoRA を使用",
+            "info": "重み分解された LoRA を使用します。",
+        },
+    },
+    "use_pissa": {
+        "en": {
+            "label": "Use PiSSA",
+            "info": "Use PiSSA method.",
+        },
+        "ru": {
+            "label": "Используйте PiSSA",
+            "info": "Используйте метод PiSSA.",
+        },
+        "zh": {
+            "label": "使用 PiSSA",
+            "info": "使用 PiSSA 方法。",
+        },
+        "ko": {
+            "label": "PiSSA 사용",
+            "info": "PiSSA 방법을 사용합니다.",
+        },
+        "ja": {
+            "label": "PiSSA を使用",
+            "info": "PiSSA メソッドを使用します。",
+        },
+    },
+    "lora_target": {
+        "en": {
+            "label": "LoRA modules (optional)",
+            "info": "Name(s) of modules to apply LoRA. Use commas to separate multiple modules.",
+        },
+        "ru": {
+            "label": "Модули LoRA (опционально)",
+            "info": "Имена модулей для применения LoRA. Используйте запятые для разделения нескольких модулей.",
+        },
+        "zh": {
+            "label": "LoRA 作用模块（非必填）",
+            "info": "应用 LoRA 的模块名称。使用英文逗号分隔多个名称。",
+        },
+        "ko": {
+            "label": "LoRA 모듈 (선택 사항)",
+            "info": "LoRA를 적용할 모듈의 이름. 모듈 간에는 쉼표(,)로 구분하십시오.",
+        },
+        "ja": {
+            "label": "LoRA モジュール（オプション）",
+            "info": "LoRA を適用するモジュールの名前。複数のモジュールを区切るにはカンマを使用します。",
+        },
+    },
+    "additional_target": {
+        "en": {
+            "label": "Additional modules (optional)",
+            "info": (
+                "Name(s) of modules apart from LoRA layers to be set as trainable. "
+                "Use commas to separate multiple modules."
+            ),
+        },
+        "ru": {
+            "label": "Дополнительные модули (опционально)",
+            "info": (
+                "Имена модулей, кроме слоев LoRA, которые следует установить в качестве обучаемых. "
+                "Используйте запятые для разделения нескольких модулей."
+            ),
+        },
+        "zh": {
+            "label": "附加模块（非必填）",
+            "info": "除 LoRA 层以外的可训练模块名称。使用英文逗号分隔多个名称。",
+        },
+        "ko": {
+            "label": "추가 모듈 (선택 사항)",
+            "info": "LoRA 레이어 외에 학습 가능하게 설정할 모듈의 이름. 모듈 간에는 쉼표(,)로 구분하십시오.",
+        },
+        "ja": {
+            "label": "追加モジュール（オプション）",
+            "info": "LoRA 層以外の学習可能なモジュールの名前。複数のモジュールを区切るにはカンマを使用します。",
+        },
+    },
+    "rlhf_tab": {
+        "en": {
+            "label": "RLHF configurations",
+        },
+        "ru": {
+            "label": "Конфигурации RLHF",
+        },
+        "zh": {
+            "label": "RLHF 参数设置",
+        },
+        "ko": {
+            "label": "RLHF 구성",
+        },
+        "ja": {
+            "label": "RLHF 設定",
+        },
+    },
+    "pref_beta": {
+        "en": {
+            "label": "Beta value",
+            "info": "Value of the beta parameter in the loss.",
+        },
+        "ru": {
+            "label": "Бета значение",
+            "info": "Значение параметра бета в функции потерь.",
+        },
+        "zh": {
+            "label": "Beta 参数",
+            "info": "损失函数中 beta 超参数大小。",
+        },
+        "ko": {
+            "label": "베타 값",
+            "info": "손실 함수에서 베타 매개 변수의 값.",
+        },
+        "ja": {
+            "label": "Beta 値",
+            "info": "損失関数における beta ハイパーパラメータの値。",
+        },
+    },
+    "pref_ftx": {
+        "en": {
+            "label": "Ftx gamma",
+            "info": "The weight of SFT loss in the final loss.",
+        },
+        "ru": {
+            "label": "Ftx гамма",
+            "info": "Вес потери SFT в итоговой потере.",
+        },
+        "zh": {
+            "label": "Ftx gamma",
+            "info": "损失函数中 SFT 损失的权重大小。",
+        },
+        "ko": {
+            "label": "Ftx 감마",
+            "info": "최종 로스 함수에서 SFT 로스의 가중치.",
+        },
+        "ja": {
+            "label": "Ftx gamma",
+            "info": "損失関数における SFT 損失の重み。",
+        },
+    },
+    "pref_loss": {
+        "en": {
+            "label": "Loss type",
+            "info": "The type of the loss function.",
+        },
+        "ru": {
+            "label": "Тип потерь",
+            "info": "Тип функции потерь.",
+        },
+        "zh": {
+            "label": "损失类型",
+            "info": "损失函数的类型。",
+        },
+        "ko": {
+            "label": "로스 유형",
+            "info": "로스 함수의 유형.",
+        },
+        "ja": {
+            "label": "損失タイプ",
+            "info": "損失関数のタイプ。",
+        },
+    },
+    "reward_model": {
+        "en": {
+            "label": "Reward model",
+            "info": "Adapter of the reward model in PPO training.",
+        },
+        "ru": {
+            "label": "Модель вознаграждения",
+            "info": "Адаптер модели вознаграждения для обучения PPO.",
+        },
+        "zh": {
+            "label": "奖励模型",
+            "info": "PPO 训练中奖励模型的适配器路径。",
+        },
+        "ko": {
+            "label": "리워드 모델",
+            "info": "PPO 학습에서 사용할 리워드 모델의 어댑터.",
+        },
+        "ja": {
+            "label": "報酬モデル",
+            "info": "PPO トレーニングにおける報酬モデルのアダプター。",
+        },
+    },
+    "ppo_score_norm": {
+        "en": {
+            "label": "Score norm",
+            "info": "Normalizing scores in PPO training.",
+        },
+        "ru": {
+            "label": "Норма оценок",
+            "info": "Нормализация оценок в тренировке PPO.",
+        },
+        "zh": {
+            "label": "归一化分数",
+            "info": "PPO 训练中归一化奖励分数。",
+        },
+        "ko": {
+            "label": "스코어 정규화",
+            "info": "PPO 학습에서 스코어를 정규화합니다.",
+        },
+        "ja": {
+            "label": "スコア正規化",
+            "info": "PPO トレーニングにおける報酬スコアの正規化。",
+        },
+    },
+    "ppo_whiten_rewards": {
+        "en": {
+            "label": "Whiten rewards",
+            "info": "Whiten the rewards in PPO training.",
+        },
+        "ru": {
+            "label": "Отбеливание вознаграждений",
+            "info": "Выполнять отбеливание (whitening) вознаграждений при обучении PPO.",
+        },
+        "zh": {
+            "label": "白化奖励",
+            "info": "PPO 训练中将奖励分数做白化处理。",
+        },
+        "ko": {
+            "label": "보상 백화",
+            "info": "PPO 훈련에서 보상을 백화(Whiten)합니다.",
+        },
+        "ja": {
+            "label": "報酬のホワイトニング",
+            "info": "PPO トレーニングにおいて報酬スコアをホワイトニング処理します。",
+        },
+    },
+    "mm_tab": {
+        "en": {
+            "label": "Multimodal configurations",
+        },
+        "ru": {
+            "label": "Мультимодальные конфигурации",
+        },
+        "zh": {
+            "label": "多模态参数设置",
+        },
+        "ko": {
+            "label": "멀티모달 구성",
+        },
+        "ja": {
+            "label": "多モーダル設定",
+        },
+    },
+    "freeze_vision_tower": {
+        "en": {
+            "label": "Freeze vision tower",
+            "info": "Freeze the vision tower in the model.",
+        },
+        "ru": {
+            "label": "Заморозить башню визиона",
+            "info": "Заморозить башню визиона в модели.",
+        },
+        "zh": {
+            "label": "冻结视觉编码器",
+            "info": "冻结模型中的视觉编码器。",
+        },
+        "ko": {
+            "label": "비전 타워 고정",
+            "info": "모델의 비전 타워를 고정합니다.",
+        },
+        "ja": {
+            "label": "ビジョンタワーの固定",
+            "info": "モデルのビジョンタワーを固定します。",
+        },
+    },
+    "freeze_multi_modal_projector": {
+        "en": {
+            "label": "Freeze multi-modal projector",
+            "info": "Freeze the multi-modal projector in the model.",
+        },
+        "ru": {
+            "label": "Заморозить мультимодальный проектор",
+            "info": "Заморозить мультимодальный проектор в модели.",
+        },
+        "zh": {
+            "label": "冻结多模态投影器",
+            "info": "冻结模型中的多模态投影器。",
+        },
+        "ko": {
+            "label": "멀티모달 프로젝터 고정",
+            "info": "모델의 멀티모달 프로젝터를 고정합니다.",
+        },
+        "ja": {
+            "label": "多モーダルプロジェクターの固定",
+            "info": "モデルの多モーダルプロジェクターを固定します。",
+        },
+    },
+    "freeze_language_model": {
+        "en": {
+            "label": "Freeze language model",
+            "info": "Freeze the language model in the model.",
+        },
+        "ru": {
+            "label": "Заморозить языковую модель",
+            "info": "Заморозить языковую модель в модели.",
+        },
+        "zh": {
+            "label": "冻结语言模型",
+            "info": "冻结模型中的语言模型。",
+        },
+        "ko": {
+            "label": "언어 모델 고정",
+            "info": "모델의 언어 모델을 고정합니다.",
+        },
+        "ja": {
+            "label": "言語モデルの固定",
+            "info": "モデルの言語モデルを固定します。",
+        },
+    },
+    "image_max_pixels": {
+        "en": {
+            "label": "Image max pixels",
+            "info": "The maximum number of pixels of image inputs.",
+        },
+        "ru": {
+            "label": "Максимальное количество пикселей изображения",
+            "info": "Максимальное количество пикселей изображения.",
+        },
+        "zh": {
+            "label": "图像最大像素",
+            "info": "输入图像的最大像素数。",
+        },
+        "ko": {
+            "label": "이미지 최대 픽셀",
+            "info": "이미지 입력의 최대 픽셀 수입니다.",
+        },
+        "ja": {
+            "label": "画像最大ピクセル",
+            "info": "画像入力の最大ピクセル数です。",
+        },
+    },
+    "image_min_pixels": {
+        "en": {
+            "label": "Image min pixels",
+            "info": "The minimum number of pixels of image inputs.",
+        },
+        "ru": {
+            "label": "Минимальное количество пикселей изображения",
+            "info": "Минимальное количество пикселей изображения.",
+        },
+        "zh": {
+            "label": "图像最小像素",
+            "info": "输入图像的最小像素数。",
+        },
+        "ko": {
+            "label": "이미지 최소 픽셀",
+            "info": "이미지 입력의 최소 픽셀 수입니다.",
+        },
+        "ja": {
+            "label": "画像最小ピクセル",
+            "info": "画像入力の最小ピクセル数です。",
+        },
+    },
+    "video_max_pixels": {
+        "en": {
+            "label": "Video max pixels",
+            "info": "The maximum number of pixels of video inputs.",
+        },
+        "ru": {
+            "label": "Максимальное количество пикселей видео",
+            "info": "Максимальное количество пикселей видео.",
+        },
+        "zh": {
+            "label": "视频最大像素",
+            "info": "输入视频的最大像素数。",
+        },
+        "ko": {
+            "label": "비디오 최대 픽셀",
+            "info": "비디오 입력의 최대 픽셀 수입니다.",
+        },
+        "ja": {
+            "label": "ビデオ最大ピクセル",
+            "info": "ビデオ入力の最大ピクセル数です。",
+        },
+    },
+    "video_min_pixels": {
+        "en": {
+            "label": "Video min pixels",
+            "info": "The minimum number of pixels of video inputs.",
+        },
+        "ru": {
+            "label": "Минимальное количество пикселей видео",
+            "info": "Минимальное количество пикселей видео.",
+        },
+        "zh": {
+            "label": "视频最小像素",
+            "info": "输入视频的最小像素数。",
+        },
+        "ko": {
+            "label": "비디오 최소 픽셀",
+            "info": "비디오 입력의 최소 픽셀 수입니다.",
+        },
+        "ja": {
+            "label": "ビデオ最小ピクセル",
+            "info": "ビデオ入力の最小ピクセル数です。",
+        },
+    },
+    "galore_tab": {
+        "en": {
+            "label": "GaLore configurations",
+        },
+        "ru": {
+            "label": "Конфигурации GaLore",
+        },
+        "zh": {
+            "label": "GaLore 参数设置",
+        },
+        "ko": {
+            "label": "GaLore 구성",
+        },
+        "ja": {
+            "label": "GaLore 設定",
+        },
+    },
+    "use_galore": {
+        "en": {
+            "label": "Use GaLore",
+            "info": "Use [GaLore](https://github.com/jiaweizzhao/GaLore) optimizer.",
+        },
+        "ru": {
+            "label": "Использовать GaLore",
+            "info": "Используйте оптимизатор [GaLore](https://github.com/jiaweizzhao/GaLore).",
+        },
+        "zh": {
+            "label": "使用 GaLore",
+            "info": "使用 [GaLore](https://github.com/jiaweizzhao/GaLore) 优化器。",
+        },
+        "ko": {
+            "label": "GaLore 사용",
+            "info": "[GaLore](https://github.com/jiaweizzhao/GaLore) 최적화를 사용하세요.",
+        },
+        "ja": {
+            "label": "GaLore を使用",
+            "info": "[GaLore](https://github.com/jiaweizzhao/GaLore) オプティマイザーを使用します。",
+        },
+    },
+    "galore_rank": {
+        "en": {
+            "label": "GaLore rank",
+            "info": "The rank of GaLore gradients.",
+        },
+        "ru": {
+            "label": "Ранг GaLore",
+            "info": "Ранг градиентов GaLore.",
+        },
+        "zh": {
+            "label": "GaLore 秩",
+            "info": "GaLore 梯度的秩大小。",
+        },
+        "ko": {
+            "label": "GaLore 랭크",
+            "info": "GaLore 그레디언트의 랭크.",
+        },
+        "ja": {
+            "label": "GaLore ランク",
+            "info": "GaLore 勾配のランク。",
+        },
+    },
+    "galore_update_interval": {
+        "en": {
+            "label": "Update interval",
+            "info": "Number of steps to update the GaLore projection.",
+        },
+        "ru": {
+            "label": "Интервал обновления",
+            "info": "Количество шагов для обновления проекции GaLore.",
+        },
+        "zh": {
+            "label": "更新间隔",
+            "info": "相邻两次投影更新的步数。",
+        },
+        "ko": {
+            "label": "업데이트 간격",
+            "info": "GaLore 프로젝션을 업데이트할 간격의 스텝 수.",
+        },
+        "ja": {
+            "label": "更新間隔",
+            "info": "隣接する 2 回の投影更新間のステップ数。",
+        },
+    },
+    "galore_scale": {
+        "en": {
+            "label": "GaLore scale",
+            "info": "GaLore scaling coefficient.",
+        },
+        "ru": {
+            "label": "Масштаб GaLore",
+            "info": "Коэффициент масштабирования GaLore.",
+        },
+        "zh": {
+            "label": "GaLore 缩放系数",
+            "info": "GaLore 缩放系数大小。",
+        },
+        "ko": {
+            "label": "GaLore 스케일",
+            "info": "GaLore 스케일링 계수.",
+        },
+        "ja": {
+            "label": "GaLore スケール",
+            "info": "GaLore スケーリング係数。",
+        },
+    },
+    "galore_target": {
+        "en": {
+            "label": "GaLore modules",
+            "info": "Name(s) of modules to apply GaLore. Use commas to separate multiple modules.",
+        },
+        "ru": {
+            "label": "Модули GaLore",
+            "info": "Имена модулей для применения GaLore. Используйте запятые для разделения нескольких модулей.",
+        },
+        "zh": {
+            "label": "GaLore 作用模块",
+            "info": "应用 GaLore 的模块名称。使用英文逗号分隔多个名称。",
+        },
+        "ko": {
+            "label": "GaLore 모듈",
+            "info": "GaLore를 적용할 모듈의 이름. 모듈 간에는 쉼표(,)로 구분하십시오.",
+        },
+        "ja": {
+            "label": "GaLore モジュール",
+            "info": "GaLore を適用するモジュールの名前。複数のモジュールを区切るにはカンマを使用します。",
+        },
+    },
+    "apollo_tab": {
+        "en": {
+            "label": "APOLLO configurations",
+        },
+        "ru": {
+            "label": "Конфигурации APOLLO",
+        },
+        "zh": {
+            "label": "APOLLO 参数设置",
+        },
+        "ko": {
+            "label": "APOLLO 구성",
+        },
+        "ja": {
+            "label": "APOLLO 設定",
+        },
+    },
+    "use_apollo": {
+        "en": {
+            "label": "Use APOLLO",
+            "info": "Use [APOLLO](https://github.com/zhuhanqing/APOLLO) optimizer.",
+        },
+        "ru": {
+            "label": "Использовать APOLLO",
+            "info": "Используйте оптимизатор [APOLLO](https://github.com/zhuhanqing/APOLLO).",
+        },
+        "zh": {
+            "label": "使用 APOLLO",
+            "info": "使用 [APOLLO](https://github.com/zhuhanqing/APOLLO) 优化器。",
+        },
+        "ko": {
+            "label": "APOLLO 사용",
+            "info": "[APOLLO](https://github.com/zhuhanqing/APOLLO) 최적화를 사용하세요.",
+        },
+        "ja": {
+            "label": "APOLLO を使用",
+            "info": "[APOLLO](https://github.com/zhuhanqing/APOLLO) オプティマイザーを使用します。",
+        },
+    },
+    "apollo_rank": {
+        "en": {
+            "label": "APOLLO rank",
+            "info": "The rank of APOLLO gradients.",
+        },
+        "ru": {
+            "label": "Ранг APOLLO",
+            "info": "Ранг градиентов APOLLO.",
+        },
+        "zh": {
+            "label": "APOLLO 秩",
+            "info": "APOLLO 梯度的秩大小。",
+        },
+        "ko": {
+            "label": "APOLLO 랭크",
+            "info": "APOLLO 그레디언트의 랭크.",
+        },
+        "ja": {
+            "label": "APOLLO ランク",
+            "info": "APOLLO 勾配のランク。",
+        },
+    },
+    "apollo_update_interval": {
+        "en": {
+            "label": "Update interval",
+            "info": "Number of steps to update the APOLLO projection.",
+        },
+        "ru": {
+            "label": "Интервал обновления",
+            "info": "Количество шагов для обновления проекции APOLLO.",
+        },
+        "zh": {
+            "label": "更新间隔",
+            "info": "相邻两次投影更新的步数。",
+        },
+        "ko": {
+            "label": "업데이트 간격",
+            "info": "APOLLO 프로젝션을 업데이트할 간격의 스텝 수.",
+        },
+        "ja": {
+            "label": "更新間隔",
+            "info": "隣接する 2 回の投影更新間のステップ数。",
+        },
+    },
+    "apollo_scale": {
+        "en": {
+            "label": "APOLLO scale",
+            "info": "APOLLO scaling coefficient.",
+        },
+        "ru": {
+            "label": "Масштаб APOLLO",
+            "info": "Коэффициент масштабирования APOLLO.",
+        },
+        "zh": {
+            "label": "APOLLO 缩放系数",
+            "info": "APOLLO 缩放系数大小。",
+        },
+        "ko": {
+            "label": "APOLLO 스케일",
+            "info": "APOLLO 스케일링 계수.",
+        },
+        "ja": {
+            "label": "APOLLO スケール",
+            "info": "APOLLO スケーリング係数。",
+        },
+    },
+    "apollo_target": {
+        "en": {
+            "label": "APOLLO modules",
+            "info": "Name(s) of modules to apply APOLLO. Use commas to separate multiple modules.",
+        },
+        "ru": {
+            "label": "Модули APOLLO",
+            "info": "Имена модулей для применения APOLLO. Используйте запятые для разделения нескольких модулей.",
+        },
+        "zh": {
+            "label": "APOLLO 作用模块",
+            "info": "应用 APOLLO 的模块名称。使用英文逗号分隔多个名称。",
+        },
+        "ko": {
+            "label": "APOLLO 모듈",
+            "info": "APOLLO를 적용할 모듈의 이름. 모듈 간에는 쉼표(,)로 구분하십시오.",
+        },
+        "ja": {
+            "label": "APOLLO モジュール",
+            "info": "APOLLO を適用するモジュールの名前。複数のモジュールを区切るにはカンマを使用します。",
+        },
+    },
+    "badam_tab": {
+        "en": {
+            "label": "BAdam configurations",
+        },
+        "ru": {
+            "label": "Конфигурации BAdam",
+        },
+        "zh": {
+            "label": "BAdam 参数设置",
+        },
+        "ko": {
+            "label": "BAdam 설정",
+        },
+        "ja": {
+            "label": "BAdam 設定",
+        },
+    },
+    "use_badam": {
+        "en": {
+            "label": "Use BAdam",
+            "info": "Enable the [BAdam](https://github.com/Ledzy/BAdam) optimizer.",
+        },
+        "ru": {
+            "label": "Использовать BAdam",
+            "info": "Включите оптимизатор [BAdam](https://github.com/Ledzy/BAdam).",
+        },
+        "zh": {
+            "label": "使用 BAdam",
+            "info": "使用 [BAdam](https://github.com/Ledzy/BAdam) 优化器。",
+        },
+        "ko": {
+            "label": "BAdam 사용",
+            "info": "[BAdam](https://github.com/Ledzy/BAdam) 옵티마이저를 사용합니다.",
+        },
+        "ja": {
+            "label": "BAdam を使用",
+            "info": "[BAdam](https://github.com/Ledzy/BAdam) オプティマイザーを使用します。",
+        },
+    },
+    "badam_mode": {
+        "en": {
+            "label": "BAdam mode",
+            "info": "Whether to use layer-wise or ratio-wise BAdam optimizer.",
+        },
+        "ru": {
+            "label": "Режим BAdam",
+            "info": "Использовать ли оптимизатор BAdam с послойной или пропорциональной настройкой.",
+        },
+        "zh": {
+            "label": "BAdam 模式",
+            "info": "使用 layer-wise 或 ratio-wise BAdam 优化器。",
+        },
+        "ko": {
+            "label": "BAdam 모드",
+            "info": "레이어-BAdam 옵티마이저인지 비율-BAdam 옵티마이저인지.",
+        },
+        "ja": {
+            "label": "BAdam モード",
+            "info": "layer-wise または ratio-wise BAdam オプティマイザーを使用します。",
+        },
+    },
+    "badam_switch_mode": {
+        "en": {
+            "label": "Switch mode",
+            "info": "The strategy of picking block to update for layer-wise BAdam.",
+        },
+        "ru": {
+            "label": "Режим переключения",
+            "info": "Стратегия выбора блока для обновления для послойного BAdam.",
+        },
+        "zh": {
+            "label": "切换策略",
+            "info": "Layer-wise BAdam 优化器的块切换策略。",
+        },
+        "ko": {
+            "label": "스위치 모드",
+            "info": "레이어-BAdam을 위한 블록 선택 전략.",
+        },
+        "ja": {
+            "label": "切り替え戦略",
+            "info": "Layer-wise BAdam オプティマイザーのブロック切り替え戦略。",
+        },
+    },
+    "badam_switch_interval": {
+        "en": {
+            "label": "Switch interval",
+            "info": "Number of steps to update the block for layer-wise BAdam.",
+        },
+        "ru": {
+            "label": "Интервал переключения",
+            "info": "Количество шагов для обновления блока для послойного BAdam.",
+        },
+        "zh": {
+            "label": "切换频率",
+            "info": "Layer-wise BAdam 优化器的块切换频率。",
+        },
+        "ko": {
+            "label": "전환 간격",
+            "info": "레이어-BAdam을 위한 블록 업데이트 간 스텝 수.",
+        },
+        "ja": {
+            "label": "切り替え頻度",
+            "info": "Layer-wise BAdam オプティマイザーのブロック切り替え頻度。",
+        },
+    },
+    "badam_update_ratio": {
+        "en": {
+            "label": "Update ratio",
+            "info": "The ratio of the update for ratio-wise BAdam.",
+        },
+        "ru": {
+            "label": "Коэффициент обновления",
+            "info": "Коэффициент обновления для BAdam с учётом соотношений.",
+        },
+        "zh": {
+            "label": "Block 更新比例",
+            "info": "Ratio-wise BAdam 优化器的更新比例。",
+        },
+        "ko": {
+            "label": "업데이트 비율",
+            "info": "비율-BAdam의 업데이트 비율.",
+        },
+        "ja": {
+            "label": "ブロック更新比率",
+            "info": "Ratio-wise BAdam オプティマイザーの更新比率。",
+        },
+    },
+    "swanlab_tab": {
+        "en": {
+            "label": "SwanLab configurations",
+        },
+        "ru": {
+            "label": "Конфигурации SwanLab",
+        },
+        "zh": {
+            "label": "SwanLab 参数设置",
+        },
+        "ko": {
+            "label": "SwanLab 설정",
+        },
+        "ja": {
+            "label": "SwanLab 設定",
+        },
+    },
+    "use_swanlab": {
+        "en": {
+            "label": "Use SwanLab",
+            "info": "Enable [SwanLab](https://swanlab.cn/) for experiment tracking and visualization.",
+        },
+        "ru": {
+            "label": "Использовать SwanLab",
+            "info": "Включить [SwanLab](https://swanlab.cn/) для отслеживания и визуализации экспериментов.",
+        },
+        "zh": {
+            "label": "使用 SwanLab",
+            "info": "启用 [SwanLab](https://swanlab.cn/) 进行实验跟踪和可视化。",
+        },
+        "ko": {
+            "label": "SwanLab 사용",
+            "info": "[SwanLab](https://swanlab.cn/) 를 사용하여 실험을 추적하고 시각화합니다.",
+        },
+        "ja": {
+            "label": "SwanLab を使用",
+            "info": "[SwanLab](https://swanlab.cn/) を有効にして実験の追跡と可視化を行います。",
+        },
+    },
+    "swanlab_project": {
+        "en": {
+            "label": "SwanLab project",
+        },
+        "ru": {
+            "label": "SwanLab Проект",
+        },
+        "zh": {
+            "label": "SwanLab 项目名",
+        },
+        "ko": {
+            "label": "SwanLab 프로젝트",
+        },
+        "ja": {
+            "label": "SwanLab プロジェクト",
+        },
+    },
+    "swanlab_run_name": {
+        "en": {
+            "label": "SwanLab experiment name (optional)",
+        },
+        "ru": {
+            "label": "SwanLab Имя эксперимента (опционально)",
+        },
+        "zh": {
+            "label": "SwanLab 实验名（非必填）",
+        },
+        "ko": {
+            "label": "SwanLab 실험 이름 (선택 사항)",
+        },
+        "ja": {
+            "label": "SwanLab 実験名（オプション）",
+        },
+    },
+    "swanlab_workspace": {
+        "en": {
+            "label": "SwanLab workspace (optional)",
+            "info": "Workspace for SwanLab. Defaults to the personal workspace.",
+        },
+        "ru": {
+            "label": "SwanLab Рабочая область (опционально)",
+            "info": "Рабочая область SwanLab, если не заполнено, то по умолчанию в личной рабочей области.",
+        },
+        "zh": {
+            "label": "SwanLab 工作区（非必填）",
+            "info": "SwanLab 的工作区，默认在个人工作区下。",
+        },
+        "ko": {
+            "label": "SwanLab 작업 영역 (선택 사항)",
+            "info": "SwanLab 조직의 작업 영역, 비어 있으면 기본적으로 개인 작업 영역에 있습니다.",
+        },
+        "ja": {
+            "label": "SwanLab ワークスペース（オプション）",
+            "info": "SwanLab のワークスペース。デフォルトでは個人ワークスペースです。",
+        },
+    },
+    "swanlab_api_key": {
+        "en": {
+            "label": "SwanLab API key (optional)",
+            "info": "API key for SwanLab.",
+        },
+        "ru": {
+            "label": "SwanLab API ключ (опционально)",
+            "info": "API ключ для SwanLab.",
+        },
+        "zh": {
+            "label": "SwanLab API 密钥（非必填）",
+            "info": "用于在编程环境登录 SwanLab，已登录则无需填写。",
+        },
+        "ko": {
+            "label": "SwanLab API 키 (선택 사항)",
+            "info": "SwanLab의 API 키.",
+        },
+        "ja": {
+            "label": "SwanLab API キー（オプション）",
+            "info": "SwanLab の API キー。",
+        },
+    },
+    "swanlab_mode": {
+        "en": {
+            "label": "SwanLab mode",
+            "info": "Cloud or offline version.",
+        },
+        "ru": {
+            "label": "SwanLab Режим",
+            "info": "Версия в облаке или локальная версия.",
+        },
+        "zh": {
+            "label": "SwanLab 模式",
+            "info": "使用云端版或离线版 SwanLab。",
+        },
+        "ko": {
+            "label": "SwanLab 모드",
+            "info": "클라우드 버전 또는 오프라인 버전.",
+        },
+        "ja": {
+            "label": "SwanLab モード",
+            "info": "クラウド版またはオフライン版 SwanLab を使用します。",
+        },
+    },
+    "swanlab_logdir": {
+        "en": {
+            "label": "SwanLab log directory",
+            "info": "The log directory for SwanLab.",
+        },
+        "ru": {
+            "label": "SwanLab Каталог журналов",
+            "info": "Каталог журналов SwanLab.",
+        },
+        "zh": {
+            "label": "SwanLab 日志目录",
+            "info": "SwanLab 的日志目录。",
+        },
+        "ko": {
+            "label": "SwanLab 로그 디렉토리",
+            "info": "SwanLab의 로그 디렉토리.",
+        },
+        "ja": {
+            "label": "SwanLab ログ ディレクトリ",
+            "info": "SwanLab のログ ディレクトリ。",
+        },
+    },
+    "cmd_preview_btn": {
+        "en": {
+            "value": "Preview command",
+        },
+        "ru": {
+            "value": "Просмотр команды",
+        },
+        "zh": {
+            "value": "预览命令",
+        },
+        "ko": {
+            "value": "명령어 미리보기",
+        },
+        "ja": {
+            "value": "コマンドをプレビュー",
+        },
+    },
+    "arg_save_btn": {
+        "en": {
+            "value": "Save arguments",
+        },
+        "ru": {
+            "value": "Сохранить аргументы",
+        },
+        "zh": {
+            "value": "保存训练参数",
+        },
+        "ko": {
+            "value": "Argument 저장",
+        },
+        "ja": {
+            "value": "引数を保存",
+        },
+    },
+    "arg_load_btn": {
+        "en": {
+            "value": "Load arguments",
+        },
+        "ru": {
+            "value": "Загрузить аргументы",
+        },
+        "zh": {
+            "value": "载入训练参数",
+        },
+        "ko": {
+            "value": "Argument 불러오기",
+        },
+        "ja": {
+            "value": "引数を読み込む",
+        },
+    },
+    "start_btn": {
+        "en": {
+            "value": "Start",
+        },
+        "ru": {
+            "value": "Начать",
+        },
+        "zh": {
+            "value": "开始",
+        },
+        "ko": {
+            "value": "시작",
+        },
+        "ja": {
+            "value": "開始",
+        },
+    },
+    "stop_btn": {
+        "en": {
+            "value": "Abort",
+        },
+        "ru": {
+            "value": "Прервать",
+        },
+        "zh": {
+            "value": "中断",
+        },
+        "ko": {
+            "value": "중단",
+        },
+        "ja": {
+            "value": "中断",
+        },
+    },
+    "output_dir": {
+        "en": {
+            "label": "Output dir",
+            "info": "Directory for saving results.",
+        },
+        "ru": {
+            "label": "Выходной каталог",
+            "info": "Каталог для сохранения результатов.",
+        },
+        "zh": {
+            "label": "输出目录",
+            "info": "保存结果的路径。",
+        },
+        "ko": {
+            "label": "출력 디렉토리",
+            "info": "결과를 저장할 디렉토리.",
+        },
+        "ja": {
+            "label": "出力ディレクトリ",
+            "info": "結果を保存するパス。",
+        },
+    },
+    "config_path": {
+        "en": {
+            "label": "Config path",
+            "info": "Path to config saving arguments.",
+        },
+        "ru": {
+            "label": "Путь к конфигурации",
+            "info": "Путь для сохранения аргументов конфигурации.",
+        },
+        "zh": {
+            "label": "配置路径",
+            "info": "保存训练参数的配置文件路径。",
+        },
+        "ko": {
+            "label": "설정 경로",
+            "info": "Arguments 저장 파일 경로.",
+        },
+        "ja": {
+            "label": "設定パス",
+            "info": "トレーニングパラメータを保存する設定ファイルのパス。",
+        },
+    },
+    "device_count": {
+        "en": {
+            "label": "Device count",
+            "info": "Number of devices available.",
+        },
+        "ru": {
+            "label": "Количество устройств",
+            "info": "Количество доступных устройств.",
+        },
+        "zh": {
+            "label": "设备数量",
+            "info": "当前可用的运算设备数。",
+        },
+        "ko": {
+            "label": "디바이스 수",
+            "info": "사용 가능한 디바이스 수.",
+        },
+        "ja": {
+            "label": "デバイス数",
+            "info": "現在利用可能な演算デバイス数。",
+        },
+    },
+    "ds_stage": {
+        "en": {
+            "label": "DeepSpeed stage",
+            "info": "DeepSpeed stage for distributed training.",
+        },
+        "ru": {
+            "label": "Этап DeepSpeed",
+            "info": "Этап DeepSpeed для распределенного обучения.",
+        },
+        "zh": {
+            "label": "DeepSpeed stage",
+            "info": "多卡训练的 DeepSpeed stage。",
+        },
+        "ko": {
+            "label": "DeepSpeed 단계",
+            "info": "분산 학습을 위한 DeepSpeed 단계.",
+        },
+        "ja": {
+            "label": "DeepSpeed stage",
+            "info": "マルチ GPU トレーニングの DeepSpeed stage。",
+        },
+    },
+    "ds_offload": {
+        "en": {
+            "label": "Enable offload",
+            "info": "Enable DeepSpeed offload (slow down training).",
+        },
+        "ru": {
+            "label": "Включить выгрузку",
+            "info": "включить выгрузку DeepSpeed (замедлит обучение).",
+        },
+        "zh": {
+            "label": "使用 offload",
+            "info": "使用 DeepSpeed offload（会减慢速度）。",
+        },
+        "ko": {
+            "label": "오프로딩 활성화",
+            "info": "DeepSpeed 오프로딩 활성화 (훈련 속도 느려짐).",
+        },
+        "ja": {
+            "label": "オフロードを使用",
+            "info": "DeepSpeed オフロードを使用します（速度が遅くなります）。",
+        },
+    },
+    "output_box": {
+        "en": {
+            "value": "Ready.",
+        },
+        "ru": {
+            "value": "Готово.",
+        },
+        "zh": {
+            "value": "准备就绪。",
+        },
+        "ko": {
+            "value": "준비 완료.",
+        },
+        "ja": {
+            "value": "準備完了。",
+        },
+    },
+    "loss_viewer": {
+        "en": {
+            "label": "Loss",
+        },
+        "ru": {
+            "label": "Потери",
+        },
+        "zh": {
+            "label": "损失",
+        },
+        "ko": {
+            "label": "손실",
+        },
+        "ja": {
+            "label": "損失",
+        },
+    },
+    "predict": {
+        "en": {
+            "label": "Save predictions",
+        },
+        "ru": {
+            "label": "Сохранить предсказания",
+        },
+        "zh": {
+            "label": "保存预测结果",
+        },
+        "ko": {
+            "label": "예측 결과 저장",
+        },
+        "ja": {
+            "label": "予測結果を保存",
+        },
+    },
+    "infer_backend": {
+        "en": {
+            "label": "Inference engine",
+        },
+        "ru": {
+            "label": "Инференс движок",
+        },
+        "zh": {
+            "label": "推理引擎",
+        },
+        "ko": {
+            "label": "추론 엔진",
+        },
+        "ja": {
+            "label": "推論エンジン",
+        },
+    },
+    "infer_dtype": {
+        "en": {
+            "label": "Inference data type",
+        },
+        "ru": {
+            "label": "Тип данных для вывода",
+        },
+        "zh": {
+            "label": "推理数据类型",
+        },
+        "ko": {
+            "label": "추론 데이터 유형",
+        },
+        "ja": {
+            "label": "推論データタイプ",
+        },
+    },
+    "load_btn": {
+        "en": {
+            "value": "Load model",
+        },
+        "ru": {
+            "value": "Загрузить модель",
+        },
+        "zh": {
+            "value": "加载模型",
+        },
+        "ko": {
+            "value": "모델 불러오기",
+        },
+        "ja": {
+            "value": "モデルを読み込む",
+        },
+    },
+    "unload_btn": {
+        "en": {
+            "value": "Unload model",
+        },
+        "ru": {
+            "value": "Выгрузить модель",
+        },
+        "zh": {
+            "value": "卸载模型",
+        },
+        "ko": {
+            "value": "모델 언로드",
+        },
+        "ja": {
+            "value": "モデルをアンロード",
+        },
+    },
+    "info_box": {
+        "en": {
+            "value": "Model unloaded, please load a model first.",
+        },
+        "ru": {
+            "value": "Модель не загружена, загрузите модель сначала.",
+        },
+        "zh": {
+            "value": "模型未加载，请先加载模型。",
+        },
+        "ko": {
+            "value": "모델이 언로드되었습니다. 모델을 먼저 불러오십시오.",
+        },
+        "ja": {
+            "value": "モデルがロードされていません。最初にモデルをロードしてください。",
+        },
+    },
+    "role": {
+        "en": {
+            "label": "Role",
+        },
+        "ru": {
+            "label": "Роль",
+        },
+        "zh": {
+            "label": "角色",
+        },
+        "ko": {
+            "label": "역할",
+        },
+        "ja": {
+            "label": "役割",
+        },
+    },
+    "system": {
+        "en": {
+            "placeholder": "System prompt (optional)",
+        },
+        "ru": {
+            "placeholder": "Системный запрос (по желанию)",
+        },
+        "zh": {
+            "placeholder": "系统提示词（非必填）",
+        },
+        "ko": {
+            "placeholder": "시스템 프롬프트 (선택 사항)",
+        },
+        "ja": {
+            "placeholder": "システムプロンプト（オプション）",
+        },
+    },
+    "tools": {
+        "en": {
+            "placeholder": "Tools (optional)",
+        },
+        "ru": {
+            "placeholder": "Инструменты (по желанию)",
+        },
+        "zh": {
+            "placeholder": "工具列表（非必填）",
+        },
+        "ko": {
+            "placeholder": "툴 (선택 사항)",
+        },
+        "ja": {
+            "placeholder": "ツールリスト（オプション）",
+        },
+    },
+    "image": {
+        "en": {
+            "label": "Image (optional)",
+        },
+        "ru": {
+            "label": "Изображение (по желанию)",
+        },
+        "zh": {
+            "label": "图像（非必填）",
+        },
+        "ko": {
+            "label": "이미지 (선택 사항)",
+        },
+        "ja": {
+            "label": "画像（オプション）",
+        },
+    },
+    "video": {
+        "en": {
+            "label": "Video (optional)",
+        },
+        "ru": {
+            "label": "Видео (по желанию)",
+        },
+        "zh": {
+            "label": "视频（非必填）",
+        },
+        "ko": {
+            "label": "비디오 (선택 사항)",
+        },
+        "ja": {
+            "label": "動画（オプション）",
+        },
+    },
+    "query": {
+        "en": {
+            "placeholder": "Input...",
+        },
+        "ru": {
+            "placeholder": "Ввод...",
+        },
+        "zh": {
+            "placeholder": "输入...",
+        },
+        "ko": {
+            "placeholder": "입력...",
+        },
+        "ja": {
+            "placeholder": "入力...",
+        },
+    },
+    "submit_btn": {
+        "en": {
+            "value": "Submit",
+        },
+        "ru": {
+            "value": "Отправить",
+        },
+        "zh": {
+            "value": "提交",
+        },
+        "ko": {
+            "value": "제출",
+        },
+        "ja": {
+            "value": "送信",
+        },
+    },
+    "max_length": {
+        "en": {
+            "label": "Maximum length",
+        },
+        "ru": {
+            "label": "Максимальная длина",
+        },
+        "zh": {
+            "label": "最大长度",
+        },
+        "ko": {
+            "label": "최대 길이",
+        },
+        "ja": {
+            "label": "最大長",
+        },
+    },
+    "max_new_tokens": {
+        "en": {
+            "label": "Maximum new tokens",
+        },
+        "ru": {
+            "label": "Максимальное количество новых токенов",
+        },
+        "zh": {
+            "label": "最大生成长度",
+        },
+        "ko": {
+            "label": "응답의 최대 길이",
+        },
+        "ja": {
+            "label": "最大生成長",
+        },
+    },
+    "top_p": {
+        "en": {
+            "label": "Top-p",
+        },
+        "ru": {
+            "label": "Лучшие-p",
+        },
+        "zh": {
+            "label": "Top-p 采样值",
+        },
+        "ko": {
+            "label": "Top-p",
+        },
+        "ja": {
+            "label": "Top-p",
+        },
+    },
+    "temperature": {
+        "en": {
+            "label": "Temperature",
+        },
+        "ru": {
+            "label": "Температура",
+        },
+        "zh": {
+            "label": "温度系数",
+        },
+        "ko": {
+            "label": "온도",
+        },
+        "ja": {
+            "label": "温度",
+        },
+    },
+    "skip_special_tokens": {
+        "en": {
+            "label": "Skip special tokens",
+        },
+        "ru": {
+            "label": "Пропустить специальные токены",
+        },
+        "zh": {
+            "label": "跳过特殊 token",
+        },
+        "ko": {
+            "label": "스페셜 토큰을 건너뛰기",
+        },
+        "ja": {
+            "label": "スペシャルトークンをスキップ",
+        },
+    },
+    "escape_html": {
+        "en": {
+            "label": "Escape HTML tags",
+        },
+        "ru": {
+            "label": "Исключить HTML теги",
+        },
+        "zh": {
+            "label": "转义 HTML 标签",
+        },
+        "ko": {
+            "label": "HTML 태그 이스케이프",
+        },
+        "ja": {
+            "label": "HTML タグをエスケープ",
+        },
+    },
+    "clear_btn": {
+        "en": {
+            "value": "Clear history",
+        },
+        "ru": {
+            "value": "Очистить историю",
+        },
+        "zh": {
+            "value": "清空历史",
+        },
+        "ko": {
+            "value": "기록 지우기",
+        },
+        "ja": {
+            "value": "履歴をクリア",
+        },
+    },
+    "export_size": {
+        "en": {
+            "label": "Max shard size (GB)",
+            "info": "The maximum size for a model file.",
+        },
+        "ru": {
+            "label": "Максимальный размер фрагмента (ГБ)",
+            "info": "Максимальный размер файла модели.",
+        },
+        "zh": {
+            "label": "最大分块大小（GB）",
+            "info": "单个模型文件的最大大小。",
+        },
+        "ko": {
+            "label": "최대 샤드 크기 (GB)",
+            "info": "모델 파일의 최대 크기.",
+        },
+        "ja": {
+            "label": "最大シャードサイズ（GB）",
+            "info": "単一のモデルファイルの最大サイズ。",
+        },
+    },
+    "export_quantization_bit": {
+        "en": {
+            "label": "Export quantization bit.",
+            "info": "Quantizing the exported model.",
+        },
+        "ru": {
+            "label": "Экспорт бита квантования",
+            "info": "Квантование экспортируемой модели.",
+        },
+        "zh": {
+            "label": "导出量化等级",
+            "info": "量化导出模型。",
+        },
+        "ko": {
+            "label": "양자화 비트 내보내기",
+            "info": "내보낸 모델의 양자화.",
+        },
+        "ja": {
+            "label": "量子化ビットをエクスポート",
+            "info": "エクスポートするモデルを量子化します。",
+        },
+    },
+    "export_quantization_dataset": {
+        "en": {
+            "label": "Export quantization dataset",
+            "info": "The calibration dataset used for quantization.",
+        },
+        "ru": {
+            "label": "Экспорт набора данных для квантования",
+            "info": "Набор данных калибровки, используемый для квантования.",
+        },
+        "zh": {
+            "label": "导出量化数据集",
+            "info": "量化过程中使用的校准数据集。",
+        },
+        "ko": {
+            "label": "양자화 데이터셋 내보내기",
+            "info": "양자화에 사용되는 교정 데이터셋.",
+        },
+        "ja": {
+            "label": "量子化データセットをエクスポート",
+            "info": "量子化プロセスで使用されるキャリブレーションデータセット。",
+        },
+    },
+    "export_device": {
+        "en": {
+            "label": "Export device",
+            "info": "Which device should be used to export model.",
+        },
+        "ru": {
+            "label": "Экспорт устройство",
+            "info": "Какое устройство следует использовать для экспорта модели.",
+        },
+        "zh": {
+            "label": "导出设备",
+            "info": "导出模型使用的设备类型。",
+        },
+        "ko": {
+            "label": "내보낼 장치",
+            "info": "모델을 내보내는 데 사용할 장치.",
+        },
+        "ja": {
+            "label": "エクスポートデバイス",
+            "info": "モデルをエクスポートするために使用するデバイスタイプ。",
+        },
+    },
+    "export_legacy_format": {
+        "en": {
+            "label": "Export legacy format",
+            "info": "Do not use safetensors to save the model.",
+        },
+        "ru": {
+            "label": "Экспорт в устаревший формат",
+            "info": "Не использовать safetensors для сохранения модели.",
+        },
+        "zh": {
+            "label": "导出旧格式",
+            "info": "不使用 safetensors 格式保存模型。",
+        },
+        "ko": {
+            "label": "레거시 형식 내보내기",
+            "info": "모델을 저장하는 데 safetensors를 사용하지 않습니다.",
+        },
+        "ja": {
+            "label": "レガシーフォーマットをエクスポート",
+            "info": "safetensors フォーマットを使用せずにモデルを保存します。",
+        },
+    },
+    "export_dir": {
+        "en": {
+            "label": "Export dir",
+            "info": "Directory to save exported model.",
+        },
+        "ru": {
+            "label": "Каталог экспорта",
+            "info": "Каталог для сохранения экспортированной модели.",
+        },
+        "zh": {
+            "label": "导出目录",
+            "info": "保存导出模型的文件夹路径。",
+        },
+        "ko": {
+            "label": "내보내기 디렉토리",
+            "info": "내보낸 모델을 저장할 디렉토리.",
+        },
+        "ja": {
+            "label": "エクスポートディレクトリ",
+            "info": "エクスポートしたモデルを保存するフォルダのパス。",
+        },
+    },
+    "export_hub_model_id": {
+        "en": {
+            "label": "HF Hub ID (optional)",
+            "info": "Repo ID for uploading model to Hugging Face hub.",
+        },
+        "ru": {
+            "label": "HF Hub ID (опционально)",
+            "info": "Идентификатор репозитория для загрузки модели на Hugging Face hub.",
+        },
+        "zh": {
+            "label": "HF Hub ID（非必填）",
+            "info": "用于将模型上传至 Hugging Face Hub 的仓库 ID。",
+        },
+        "ko": {
+            "label": "HF 허브 ID (선택 사항)",
+            "info": "모델을 Hugging Face 허브에 업로드하기 위한 레포 ID.",
+        },
+        "ja": {
+            "label": "HF Hub ID（オプション）",
+            "info": "Hugging Face Hub にモデルをアップロードするためのリポジトリ ID。",
+        },
+    },
+    "export_btn": {
+        "en": {
+            "value": "Export",
+        },
+        "ru": {
+            "value": "Экспорт",
+        },
+        "zh": {
+            "value": "开始导出",
+        },
+        "ko": {
+            "value": "내보내기",
+        },
+        "ja": {
+            "value": "エクスポート",
+        },
+    },
+    "device_memory": {
+        "en": {
+            "label": "Device memory",
+            "info": "Current memory usage of the device (GB).",
+        },
+        "ru": {
+            "label": "Память устройства",
+            "info": "Текущая память на устройстве (GB).",
+        },
+        "zh": {
+            "label": "设备显存",
+            "info": "当前设备的显存（GB）。",
+        },
+        "ko": {
+            "label": "디바이스 메모리",
+            "info": "지금 사용 중인 기기 메모리 (GB).",
+        },
+        "ja": {
+            "label": "デバイスメモリ",
+            "info": "現在のデバイスのメモリ（GB）。",
+        },
+    },
+}
+
+
+ALERTS = {  # alert id (err_* / warn_* / info_*) -> language code (en/ru/zh/ko/ja) -> localized message
+    "err_conflict": {
+        "en": "A process is in running, please abort it first.",
+        "ru": "Процесс уже запущен, пожалуйста, сначала прервите его.",
+        "zh": "任务已存在，请先中断训练。",
+        "ko": "프로세스가 실행 중입니다. 먼저 중단하십시오.",
+        "ja": "プロセスが実行中です。最初に中断してください。",
+    },
+    "err_exists": {
+        "en": "You have loaded a model, please unload it first.",
+        "ru": "Вы загрузили модель, сначала разгрузите ее.",
+        "zh": "模型已存在，请先卸载模型。",
+        "ko": "모델이 로드되었습니다. 먼저 언로드하십시오.",
+        "ja": "モデルがロードされています。最初にアンロードしてください。",
+    },
+    "err_no_model": {
+        "en": "Please select a model.",
+        "ru": "Пожалуйста, выберите модель.",
+        "zh": "请选择模型。",
+        "ko": "모델을 선택하십시오.",
+        "ja": "モデルを選択してください。",
+    },
+    "err_no_path": {
+        "en": "Model not found.",
+        "ru": "Модель не найдена.",
+        "zh": "模型未找到。",
+        "ko": "모델을 찾을 수 없습니다.",
+        "ja": "モデルが見つかりません。",
+    },
+    "err_no_dataset": {
+        "en": "Please choose a dataset.",
+        "ru": "Пожалуйста, выберите набор данных.",
+        "zh": "请选择数据集。",
+        "ko": "데이터 세트를 선택하십시오.",
+        "ja": "データセットを選択してください。",
+    },
+    "err_no_adapter": {
+        "en": "Please select an adapter.",
+        "ru": "Пожалуйста, выберите адаптер.",
+        "zh": "请选择适配器。",
+        "ko": "어댑터를 선택하십시오.",
+        "ja": "アダプターを選択してください。",
+    },
+    "err_no_output_dir": {
+        "en": "Please provide output dir.",
+        "ru": "Пожалуйста, укажите выходную директорию.",
+        "zh": "请填写输出目录。",
+        "ko": "출력 디렉토리를 제공하십시오.",
+        "ja": "出力ディレクトリを入力してください。",
+    },
+    "err_no_reward_model": {
+        "en": "Please select a reward model.",
+        "ru": "Пожалуйста, выберите модель вознаграждения.",
+        "zh": "请选择奖励模型。",
+        "ko": "리워드 모델을 선택하십시오.",
+        "ja": "報酬モデルを選択してください。",
+    },
+    "err_no_export_dir": {
+        "en": "Please provide export dir.",
+        "ru": "Пожалуйста, укажите каталог для экспорта.",
+        "zh": "请填写导出目录。",
+        "ko": "Export 디렉토리를 제공하십시오.",
+        "ja": "エクスポートディレクトリを入力してください。",
+    },
+    "err_gptq_lora": {
+        "en": "Please merge adapters before quantizing the model.",
+        "ru": "Пожалуйста, объедините адаптеры перед квантованием модели.",
+        "zh": "量化模型前请先合并适配器。",
+        "ko": "모델을 양자화하기 전에 어댑터를 병합하십시오.",
+        "ja": "モデルを量子化する前にアダプターをマージしてください。",
+    },
+    "err_failed": {
+        "en": "Failed.",
+        "ru": "Ошибка.",
+        "zh": "训练出错。",
+        "ko": "실패했습니다.",
+        "ja": "失敗しました。",
+    },
+    "err_demo": {
+        "en": "Training is unavailable in demo mode, duplicate the space to a private one first.",
+        "ru": "Обучение недоступно в демонстрационном режиме, сначала скопируйте пространство в частное.",
+        "zh": "展示模式不支持训练，请先复制到私人空间。",
+        "ko": "데모 모드에서는 훈련을 사용할 수 없습니다. 먼저 프라이빗 레포지토리로 작업 공간을 복제하십시오.",
+        "ja": "デモモードではトレーニングは利用できません。最初にプライベートスペースに複製してください。",
+    },
+    "err_tool_name": {
+        "en": "Tool name not found.",
+        "ru": "Имя инструмента не найдено.",
+        "zh": "工具名称未找到。",
+        "ko": "툴 이름을 찾을 수 없습니다.",
+        "ja": "ツール名が見つかりません。",
+    },
+    "err_json_schema": {
+        "en": "Invalid JSON schema.",
+        "ru": "Неверная схема JSON.",
+        "zh": "Json 格式错误。",
+        "ko": "잘못된 JSON 스키마입니다.",
+        "ja": "JSON スキーマが無効です。",
+    },
+    "err_config_not_found": {
+        "en": "Config file is not found.",
+        "ru": "Файл конфигурации не найден.",
+        "zh": "未找到配置文件。",
+        "ko": "Config 파일을 찾을 수 없습니다.",
+        "ja": "設定ファイルが見つかりません。",
+    },
+    "warn_no_cuda": {
+        "en": "CUDA environment was not detected.",
+        "ru": "Среда CUDA не обнаружена.",
+        "zh": "未检测到 CUDA 环境。",
+        "ko": "CUDA 환경이 감지되지 않았습니다.",
+        "ja": "CUDA 環境が検出されませんでした。",
+    },
+    "warn_output_dir_exists": {
+        "en": "Output dir already exists, will resume training from here.",
+        "ru": "Выходной каталог уже существует, обучение будет продолжено отсюда.",
+        "zh": "输出目录已存在，将从该断点恢复训练。",
+        "ko": "출력 디렉토리가 이미 존재합니다. 위 출력 디렉토리에 저장된 학습을 재개합니다.",
+        "ja": "出力ディレクトリが既に存在します。このチェックポイントからトレーニングを再開します。",
+    },
+    "warn_no_instruct": {
+        "en": "You are using a non-instruct model, please fine-tune it first.",
+        "ru": "Вы используете модель без инструкции, пожалуйста, сначала выполните донастройку этой модели.",
+        "zh": "您正在使用非指令模型，请先对其进行微调。",
+        "ko": "당신은 지시하지 않은 모델을 사용하고 있습니다. 먼저 이를 미세 조정해 주세요.",
+        "ja": "インストラクションモデルを使用していません。まずモデルをアダプターに適合させてください。",
+    },
+    "info_aborting": {
+        "en": "Aborted, wait for terminating...",
+        "ru": "Прервано, ожидание завершения...",
+        "zh": "训练中断，正在等待进程结束……",
+        "ko": "중단되었습니다. 종료를 기다리십시오...",
+        "ja": "トレーニングが中断されました。プロセスの終了を待っています...",
+    },
+    "info_aborted": {
+        "en": "Ready.",
+        "ru": "Готово.",
+        "zh": "准备就绪。",
+        "ko": "준비되었습니다.",
+        "ja": "準備完了。",
+    },
+    "info_finished": {
+        "en": "Finished.",
+        "ru": "Завершено.",
+        "zh": "训练完毕。",
+        "ko": "완료되었습니다.",
+        "ja": "トレーニングが完了しました。",
+    },
+    "info_config_saved": {
+        "en": "Arguments have been saved at: ",
+        "ru": "Аргументы были сохранены по адресу: ",
+        "zh": "训练参数已保存至：",
+        "ko": "매개변수가 저장되었습니다: ",
+        "ja": "トレーニングパラメータが保存されました: ",
+    },
+    "info_config_loaded": {
+        "en": "Arguments have been restored.",
+        "ru": "Аргументы были восстановлены.",
+        "zh": "训练参数已载入。",
+        "ko": "매개변수가 복원되었습니다.",
+        "ja": "トレーニングパラメータが読み込まれました。",
+    },
+    "info_loading": {
+        "en": "Loading model...",
+        "ru": "Загрузка модели...",
+        "zh": "加载中……",
+        "ko": "모델 로딩 중...",
+        "ja": "モデルをロード中...",
+    },
+    "info_unloading": {
+        "en": "Unloading model...",
+        "ru": "Выгрузка модели...",
+        "zh": "卸载中……",
+        "ko": "모델 언로딩 중...",
+        "ja": "モデルをアンロード中...",
+    },
+    "info_loaded": {
+        "en": "Model loaded, now you can chat with your model!",
+        "ru": "Модель загружена, теперь вы можете общаться с вашей моделью!",
+        "zh": "模型已加载，可以开始聊天了！",
+        "ko": "모델이 로드되었습니다. 이제 모델과 채팅할 수 있습니다!",
+        "ja": "モデルがロードされました。チャットを開始できます！",
+    },
+    "info_unloaded": {
+        "en": "Model unloaded.",
+        "ru": "Модель выгружена.",
+        "zh": "模型已卸载。",
+        "ko": "모델이 언로드되었습니다.",
+        "ja": "モデルがアンロードされました。",
+    },
+    "info_thinking": {
+        "en": "🌀 Thinking...",
+        "ru": "🌀 Думаю...",
+        "zh": "🌀 思考中...",
+        "ko": "🌀 생각 중...",
+        "ja": "🌀 考えています...",
+    },
+    "info_thought": {
+        "en": "✅ Thought",
+        "ru": "✅ Думать закончено",
+        "zh": "✅ 思考完成",
+        "ko": "✅ 생각이 완료되었습니다",
+        "ja": "✅ 思考完了",
+    },
+    "info_exporting": {
+        "en": "Exporting model...",
+        "ru": "Экспорт модели...",
+        "zh": "正在导出模型……",
+        "ko": "모델 내보내기 중...",
+        "ja": "モデルをエクスポート中...",
+    },
+    "info_exported": {
+        "en": "Model exported.",
+        "ru": "Модель экспортирована.",
+        "zh": "模型导出完成。",
+        "ko": "모델이 내보내졌습니다.",
+        "ja": "モデルのエクスポートが完了しました。",
+    },
+    "info_swanlab_link": {
+        "en": "### SwanLab Link\n",
+        "ru": "### SwanLab ссылка\n",
+        "zh": "### SwanLab 链接\n",
+        "ko": "### SwanLab 링크\n",
+        "ja": "### SwanLab リンク\n",
+    },
+}
diff --git a/LlamaFactory/src/llamafactory/webui/manager.py b/LlamaFactory/src/llamafactory/webui/manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..e762fa6b5e427a5b0a77e4faa7e28f413c243863
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/manager.py
@@ -0,0 +1,70 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+
+class Manager:
+    r"""Two-way registry between Web UI element ids and their gradio components."""
+
+    def __init__(self) -> None:
+        # Both maps are filled together by `add_elems`, one per lookup direction.
+        self._id_to_elem: dict[str, Component] = {}
+        self._elem_to_id: dict[Component, str] = {}
+
+    def add_elems(self, tab_name: str, elem_dict: dict[str, "Component"]) -> None:
+        r"""Register each element of `elem_dict` under the id `<tab_name>.<elem_name>`."""
+        for name, component in elem_dict.items():
+            full_id = f"{tab_name}.{name}"
+            self._id_to_elem[full_id] = component
+            self._elem_to_id[component] = full_id
+
+    def get_elem_list(self) -> list["Component"]:
+        r"""Return all registered elements as a list."""
+        return [*self._id_to_elem.values()]
+
+    def get_elem_iter(self) -> Generator[tuple[str, "Component"], None, None]:
+        r"""Yield `(name, element)` pairs, where name is the id part after the last dot."""
+        for full_id, component in self._id_to_elem.items():
+            yield full_id.rpartition(".")[2], component
+
+    def get_elem_by_id(self, elem_id: str) -> "Component":
+        r"""Look up an element by its full id, e.g. `top.lang` or `train.dataset`."""
+        return self._id_to_elem[elem_id]
+
+    def get_id_by_elem(self, elem: "Component") -> str:
+        r"""Look up the full id recorded for an element."""
+        return self._elem_to_id[elem]
+
+    def get_base_elems(self) -> set["Component"]:
+        r"""Return the set of commonly used elements registered under the `top` tab."""
+        base_names = (
+            "lang",
+            "model_name",
+            "model_path",
+            "finetuning_type",
+            "checkpoint_path",
+            "quantization_bit",
+            "quantization_method",
+            "template",
+            "rope_scaling",
+            "booster",
+        )
+        # Raises KeyError if any of these ids has not been registered yet.
+        return {self._id_to_elem[f"top.{base_name}"] for base_name in base_names}
diff --git a/LlamaFactory/src/llamafactory/webui/runner.py b/LlamaFactory/src/llamafactory/webui/runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c772f5dc4b1ffd6b94c90284e9b6d006feec0bb
--- /dev/null
+++ b/LlamaFactory/src/llamafactory/webui/runner.py
@@ -0,0 +1,505 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections.abc import Generator
+from copy import deepcopy
+from subprocess import PIPE, Popen, TimeoutExpired
+from typing import TYPE_CHECKING, Any
+
+from transformers.utils import is_torch_npu_available
+
+from ..extras.constants import LLAMABOARD_CONFIG, MULTIMODAL_SUPPORTED_MODELS, PEFT_METHODS, TRAINING_STAGES
+from ..extras.misc import is_accelerator_available, torch_gc
+from ..extras.packages import is_gradio_available
+from .common import (
+    DEFAULT_CACHE_DIR,
+    DEFAULT_CONFIG_DIR,
+    abort_process,
+    calculate_pixels,
+    gen_cmd,
+    get_save_dir,
+    load_args,
+    load_config,
+    load_eval_results,
+    save_args,
+    save_cmd,
+)
+from .control import get_trainer_info
+from .locales import ALERTS, LOCALES
+
+
+if is_gradio_available():
+    import gradio as gr
+
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+
+    from .manager import Manager
+
+
+class Runner:
+    r"""A class to manage the running status of the trainers."""
+
+    def __init__(self, manager: "Manager", demo_mode: bool = False) -> None:
+        r"""Store the manager and demo flag, then reset the per-run and status attributes."""
+        self.manager, self.demo_mode = manager, demo_mode
+
+        # Resume: process handle, mode and UI values of the run being monitored
+        self.trainer: Popen | None = None
+        self.do_train: bool = True
+        self.running_data: dict["Component", Any] | None = None
+
+        # State: status flags of the current run
+        self.aborted, self.running = False, False
+
+    def set_abort(self) -> None:  # flag the run as aborted and abort the trainer process if one exists
+        self.aborted = True
+        if (process := self.trainer) is not None:
+            abort_process(process.pid)
+
+    def _initialize(self, data: dict["Component", Any], do_train: bool, from_preview: bool) -> str:
+        r"""Validate the UI values and return a localized error message (empty string when valid)."""
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
+        lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path")
+        dataset = get("train.dataset") if do_train else get("eval.dataset")
+
+        if self.running:
+            return ALERTS["err_conflict"][lang]
+
+        if not model_name:
+            return ALERTS["err_no_model"][lang]
+
+        if not model_path:
+            return ALERTS["err_no_path"][lang]
+
+        if not dataset:
+            return ALERTS["err_no_dataset"][lang]
+
+        if not from_preview and self.demo_mode:
+            return ALERTS["err_demo"][lang]
+
+        if do_train:
+            if not get("train.output_dir"):
+                return ALERTS["err_no_output_dir"][lang]
+
+            try:  # extra_args is later merged with dict.update(), so it must decode to a JSON object
+                if not isinstance(json.loads(get("train.extra_args")), dict):
+                    return ALERTS["err_json_schema"][lang]
+            except json.JSONDecodeError:
+                return ALERTS["err_json_schema"][lang]
+
+            stage = TRAINING_STAGES[get("train.training_stage")]
+            if stage == "ppo" and not get("train.reward_model"):
+                return ALERTS["err_no_reward_model"][lang]
+        elif not get("eval.output_dir"):
+            return ALERTS["err_no_output_dir"][lang]
+
+        if not from_preview and not is_accelerator_available():
+            gr.Warning(ALERTS["warn_no_cuda"][lang])
+
+        return ""
+
+    def _finalize(self, lang: str, finish_info: str) -> None:
+        r"""Show the final notice (the abort notice after an abort), reset the runner, call torch_gc()."""
+        if self.aborted:
+            finish_info = ALERTS["info_aborted"][lang]
+
+        gr.Info(finish_info)
+        self.trainer = self.running_data = None
+        self.aborted = self.running = False
+        torch_gc()
+
+    def _parse_train_args(self, data: dict["Component", Any]) -> dict[str, Any]:
+        r"""Build the training arguments dict from the UI values held in ``data``."""
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
+        model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type")
+        user_config = load_config()
+
+        args = dict(
+            stage=TRAINING_STAGES[get("train.training_stage")],
+            do_train=True,
+            model_name_or_path=get("top.model_path"),
+            cache_dir=user_config.get("cache_dir", None),
+            preprocessing_num_workers=16,
+            finetuning_type=finetuning_type,
+            template=get("top.template"),
+            rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") != "none" else None,
+            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
+            use_unsloth=(get("top.booster") == "unsloth"),
+            enable_liger_kernel=(get("top.booster") == "liger_kernel"),
+            dataset_dir=get("train.dataset_dir"),
+            dataset=",".join(get("train.dataset")),
+            cutoff_len=get("train.cutoff_len"),
+            learning_rate=float(get("train.learning_rate")),
+            num_train_epochs=float(get("train.num_train_epochs")),
+            max_samples=int(get("train.max_samples")),
+            per_device_train_batch_size=get("train.batch_size"),
+            gradient_accumulation_steps=get("train.gradient_accumulation_steps"),
+            lr_scheduler_type=get("train.lr_scheduler_type"),
+            max_grad_norm=float(get("train.max_grad_norm")),
+            logging_steps=get("train.logging_steps"),
+            save_steps=get("train.save_steps"),
+            warmup_steps=get("train.warmup_steps"),
+            neftune_noise_alpha=get("train.neftune_alpha") or None,
+            packing=get("train.packing") or get("train.neat_packing"),  # neat_packing switches packing on too
+            neat_packing=get("train.neat_packing"),
+            train_on_prompt=get("train.train_on_prompt"),
+            mask_history=get("train.mask_history"),
+            resize_vocab=get("train.resize_vocab"),
+            use_llama_pro=get("train.use_llama_pro"),
+            enable_thinking=get("train.enable_thinking"),
+            report_to=get("train.report_to"),
+            use_galore=get("train.use_galore"),
+            use_apollo=get("train.use_apollo"),
+            use_badam=get("train.use_badam"),
+            use_swanlab=get("train.use_swanlab"),
+            output_dir=get_save_dir(model_name, finetuning_type, get("train.output_dir")),
+            fp16=(get("train.compute_type") == "fp16"),
+            bf16=(get("train.compute_type") == "bf16"),
+            pure_bf16=(get("train.compute_type") == "pure_bf16"),
+            plot_loss=True,
+            trust_remote_code=True,
+            ddp_timeout=180000000,
+            include_num_input_tokens_seen=True,
+        )
+        args.update(json.loads(get("train.extra_args")))  # user-supplied JSON overrides the values above
+
+        # checkpoints
+        if get("top.checkpoint_path"):
+            if finetuning_type in PEFT_METHODS:  # list
+                args["adapter_name_or_path"] = ",".join(
+                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("top.checkpoint_path")]
+                )
+            else:  # str
+                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, get("top.checkpoint_path"))
+
+        # quantization
+        if get("top.quantization_bit") != "none":
+            args["quantization_bit"] = int(get("top.quantization_bit"))
+            args["quantization_method"] = get("top.quantization_method")
+            args["double_quantization"] = not is_torch_npu_available()
+
+        # freeze config
+        if args["finetuning_type"] == "freeze":
+            args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")
+            args["freeze_trainable_modules"] = get("train.freeze_trainable_modules")
+            args["freeze_extra_modules"] = get("train.freeze_extra_modules") or None
+
+        # lora config
+        if args["finetuning_type"] == "lora":
+            args["lora_rank"] = get("train.lora_rank")
+            args["lora_alpha"] = get("train.lora_alpha")
+            args["lora_dropout"] = get("train.lora_dropout")
+            args["loraplus_lr_ratio"] = get("train.loraplus_lr_ratio") or None
+            args["create_new_adapter"] = get("train.create_new_adapter")
+            args["use_rslora"] = get("train.use_rslora")
+            args["use_dora"] = get("train.use_dora")
+            args["pissa_init"] = get("train.use_pissa")  # one UI switch drives both pissa options
+            args["pissa_convert"] = get("train.use_pissa")
+            args["lora_target"] = get("train.lora_target") or "all"
+            args["additional_target"] = get("train.additional_target") or None
+
+            if args["use_llama_pro"]:
+                args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")
+
+        # rlhf config
+        if args["stage"] == "ppo":
+            if finetuning_type in PEFT_METHODS:
+                args["reward_model"] = ",".join(
+                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("train.reward_model")]
+                )
+            else:
+                args["reward_model"] = get_save_dir(model_name, finetuning_type, get("train.reward_model"))
+
+            args["reward_model_type"] = "lora" if finetuning_type == "lora" else "full"
+            args["ppo_score_norm"] = get("train.ppo_score_norm")
+            args["ppo_whiten_rewards"] = get("train.ppo_whiten_rewards")
+            args["top_k"] = 0
+            args["top_p"] = 0.9
+        elif args["stage"] in ["dpo", "kto"]:
+            args["pref_beta"] = get("train.pref_beta")
+            args["pref_ftx"] = get("train.pref_ftx")
+            args["pref_loss"] = get("train.pref_loss")
+
+        # multimodal config
+        if model_name in MULTIMODAL_SUPPORTED_MODELS:
+            args["freeze_vision_tower"] = get("train.freeze_vision_tower")
+            args["freeze_multi_modal_projector"] = get("train.freeze_multi_modal_projector")
+            args["freeze_language_model"] = get("train.freeze_language_model")
+            args["image_max_pixels"] = calculate_pixels(get("train.image_max_pixels"))
+            args["image_min_pixels"] = calculate_pixels(get("train.image_min_pixels"))
+            args["video_max_pixels"] = calculate_pixels(get("train.video_max_pixels"))
+            args["video_min_pixels"] = calculate_pixels(get("train.video_min_pixels"))
+
+        # galore config
+        if args["use_galore"]:
+            args["galore_rank"] = get("train.galore_rank")
+            args["galore_update_interval"] = get("train.galore_update_interval")
+            args["galore_scale"] = get("train.galore_scale")
+            args["galore_target"] = get("train.galore_target")
+
+        # apollo config
+        if args["use_apollo"]:
+            args["apollo_rank"] = get("train.apollo_rank")
+            args["apollo_update_interval"] = get("train.apollo_update_interval")
+            args["apollo_scale"] = get("train.apollo_scale")
+            args["apollo_target"] = get("train.apollo_target")
+
+        # badam config
+        if args["use_badam"]:
+            args["badam_mode"] = get("train.badam_mode")
+            args["badam_switch_mode"] = get("train.badam_switch_mode")
+            args["badam_switch_interval"] = get("train.badam_switch_interval")
+            args["badam_update_ratio"] = get("train.badam_update_ratio")
+
+        # swanlab config (keyed on the UI value, not on args, unlike the three sections above)
+        if get("train.use_swanlab"):
+            args["swanlab_project"] = get("train.swanlab_project")
+            args["swanlab_run_name"] = get("train.swanlab_run_name")
+            args["swanlab_workspace"] = get("train.swanlab_workspace")
+            args["swanlab_api_key"] = get("train.swanlab_api_key")
+            args["swanlab_mode"] = get("train.swanlab_mode")
+
+        # eval config
+        if get("train.val_size") > 1e-6 and args["stage"] != "ppo":
+            args["val_size"] = get("train.val_size")
+            args["eval_strategy"] = "steps"
+            args["eval_steps"] = args["save_steps"]  # evaluate at the same interval as checkpoint saving
+            args["per_device_eval_batch_size"] = args["per_device_train_batch_size"]
+
+        # ds config
+        if get("train.ds_stage") != "none":
+            ds_stage = get("train.ds_stage")
+            ds_offload = "offload_" if get("train.ds_offload") else ""
+            args["deepspeed"] = os.path.join(DEFAULT_CACHE_DIR, f"ds_z{ds_stage}_{ds_offload}config.json")
+
+        return args
+
+    def _parse_eval_args(self, data: dict["Component", Any]) -> dict[str, Any]:
+        r"""Assemble the evaluation / prediction arguments dict from the UI values in ``data``."""
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
+        model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type")
+        user_config = load_config()
+
+        # base arguments; the sections further down add to or replace some of them
+        args = {
+            "stage": "sft",
+            "model_name_or_path": get("top.model_path"),
+            "cache_dir": user_config.get("cache_dir", None),
+            "preprocessing_num_workers": 16,
+            "finetuning_type": finetuning_type,
+            "quantization_method": get("top.quantization_method"),
+            "template": get("top.template"),
+            "rope_scaling": None if get("top.rope_scaling") == "none" else get("top.rope_scaling"),
+            "flash_attn": "fa2" if get("top.booster") == "flashattn2" else "auto",
+            "use_unsloth": get("top.booster") == "unsloth",
+            "dataset_dir": get("eval.dataset_dir"),
+            "eval_dataset": ",".join(get("eval.dataset")),
+            "cutoff_len": get("eval.cutoff_len"),
+            "max_samples": int(get("eval.max_samples")),
+            "per_device_eval_batch_size": get("eval.batch_size"),
+            "predict_with_generate": True,
+            "report_to": "none",
+            "max_new_tokens": get("eval.max_new_tokens"),
+            "top_p": get("eval.top_p"),
+            "temperature": get("eval.temperature"),
+            "output_dir": get_save_dir(model_name, finetuning_type, get("eval.output_dir")),
+            "trust_remote_code": True,
+            "ddp_timeout": 180000000,
+        }
+
+        # exactly one of do_predict / do_eval is switched on
+        args["do_predict" if get("eval.predict") else "do_eval"] = True
+
+        # checkpoints: PEFT methods provide a list of adapter names,
+        # every other finetuning type provides a single checkpoint name
+        checkpoint_path = get("top.checkpoint_path")
+        if checkpoint_path and finetuning_type in PEFT_METHODS:
+            adapter_dirs = [get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path]
+            args["adapter_name_or_path"] = ",".join(adapter_dirs)
+        elif checkpoint_path:
+            args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path)
+
+        # quantization
+        quantization_bit = get("top.quantization_bit")
+        if quantization_bit != "none":
+            args["quantization_bit"] = int(quantization_bit)
+            args["quantization_method"] = get("top.quantization_method")
+            args["double_quantization"] = not is_torch_npu_available()
+
+        return args
+
+    def _preview(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", str], None, None]:
+        r"""Yield one update for the output box: the validation error, or the generated command."""
+        tab_name = "train" if do_train else "eval"
+        output_box = self.manager.get_elem_by_id(f"{tab_name}.output_box")
+        message = self._initialize(data, do_train, from_preview=True)
+        if message:  # a non-empty string means validation failed
+            gr.Warning(message)
+        else:
+            message = gen_cmd(self._parse_train_args(data) if do_train else self._parse_eval_args(data))
+        yield {output_box: message}
+
+    def _launch(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", Any], None, None]:
+        r"""Validate ``data``, spawn ``llamafactory-cli train`` in a subprocess and relay ``monitor()`` updates."""
+        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
+        error = self._initialize(data, do_train, from_preview=False)
+        if error:
+            gr.Warning(error)
+            yield {output_box: error}
+        else:
+            self.do_train, self.running_data = do_train, data
+            args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
+
+            os.makedirs(args["output_dir"], exist_ok=True)
+            save_args(os.path.join(args["output_dir"], LLAMABOARD_CONFIG), self._build_config_dict(data))
+
+            env = os.environ.copy()  # plain dict: unlike deepcopy(os.environ), writes never reach putenv()
+            env["LLAMABOARD_ENABLED"] = "1"
+            env["LLAMABOARD_WORKDIR"] = args["output_dir"]
+            if args.get("deepspeed", None) is not None:
+                env["FORCE_TORCHRUN"] = "1"
+
+            # NOTE: DO NOT USE shell=True to avoid security risk
+            self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env, stderr=PIPE, text=True)
+            yield from self.monitor()
+
+    def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
+        r"""Map element ids to their current values, leaving out the ids that are not stored.
+
+        Left out: ``top.lang``, ``top.model_path``, ``train.output_dir`` and ``train.config_path``.
+        """
+        excluded = ("top.lang", "top.model_path", "train.output_dir", "train.config_path")
+        # translate each component key into its element id via the manager
+        id_value_pairs = ((self.manager.get_id_by_elem(elem), value) for elem, value in data.items())
+
+        return {elem_id: value for elem_id, value in id_value_pairs if elem_id not in excluded}
+
+    def preview_train(self, data):  # generator wrapper: _preview() in training mode
+        yield from self._preview(data, do_train=True)
+
+    def preview_eval(self, data):  # generator wrapper: _preview() in evaluation mode
+        yield from self._preview(data, do_train=False)
+
+    def run_train(self, data):  # generator wrapper: _launch() in training mode
+        yield from self._launch(data, do_train=True)
+
+    def run_eval(self, data):  # generator wrapper: _launch() in evaluation mode
+        yield from self._launch(data, do_train=False)
+
+    def monitor(self):
+        r"""Poll the trainer subprocess, yielding UI updates until it exits, then yield the final log."""
+        self.aborted = False
+        self.running = True
+
+        get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
+        lang, model_name, finetuning_type = get("top.lang"), get("top.model_name"), get("top.finetuning_type")
+        output_dir = get("{}.output_dir".format("train" if self.do_train else "eval"))
+        output_path = get_save_dir(model_name, finetuning_type, output_dir)
+
+        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if self.do_train else "eval"))
+        progress_bar = self.manager.get_elem_by_id("{}.progress_bar".format("train" if self.do_train else "eval"))
+        loss_viewer = self.manager.get_elem_by_id("train.loss_viewer") if self.do_train else None
+        swanlab_link = self.manager.get_elem_by_id("train.swanlab_link") if self.do_train else None
+
+        running_log = ""
+        return_code = -1  # sentinel: no exit code has been collected yet
+        while return_code == -1:
+            if self.aborted:
+                yield {
+                    output_box: ALERTS["info_aborting"][lang],
+                    progress_bar: gr.Slider(visible=False),
+                }
+            else:
+                running_log, running_progress, running_info = get_trainer_info(lang, output_path, self.do_train)
+                return_dict = {
+                    output_box: running_log,
+                    progress_bar: running_progress,
+                }
+                if "loss_viewer" in running_info:
+                    return_dict[loss_viewer] = running_info["loss_viewer"]
+
+                if "swanlab_link" in running_info:
+                    return_dict[swanlab_link] = running_info["swanlab_link"]
+
+                yield return_dict
+
+            try:
+                stderr = self.trainer.communicate(timeout=2)[1]  # [1] is the piped stderr text
+                return_code = self.trainer.returncode
+            except TimeoutExpired:
+                continue  # trainer still running after 2 s: refresh the UI again
+
+        if return_code == 0 or self.aborted:  # an aborted run also takes this branch
+            finish_info = ALERTS["info_finished"][lang]
+            if self.do_train:
+                finish_log = ALERTS["info_finished"][lang] + "\n\n" + running_log
+            else:
+                finish_log = load_eval_results(os.path.join(output_path, "all_results.json")) + "\n\n" + running_log
+        else:
+            print(stderr)
+            finish_info = ALERTS["err_failed"][lang]
+            finish_log = ALERTS["err_failed"][lang] + f" Exit code: {return_code}\n\n```\n{stderr}\n```\n"
+
+        self._finalize(lang, finish_info)  # replaces finish_info with the abort notice if aborted
+        return_dict = {output_box: finish_log, progress_bar: gr.Slider(visible=False)}
+        yield return_dict
+
+    def save_args(self, data):
+        r"""Validate the training form, then write its values to a file under the config directory."""
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
+        output_box = self.manager.get_elem_by_id("train.output_box")
+        problem = self._initialize(data, do_train=True, from_preview=True)
+        if problem:
+            gr.Warning(problem)
+            return {output_box: problem}
+
+        lang, file_name = get("top.lang"), get("train.config_path")
+        os.makedirs(DEFAULT_CONFIG_DIR, exist_ok=True)
+        target = os.path.join(DEFAULT_CONFIG_DIR, file_name)
+        save_args(target, self._build_config_dict(data))  # the module-level helper, not this method
+
+        return {output_box: ALERTS["info_config_saved"][lang] + target}
+
+    def load_args(self, lang: str, config_path: str):
+        r"""Read a saved configuration and turn it into component updates plus a status message."""
+        output_box = self.manager.get_elem_by_id("train.output_box")
+        saved_values = load_args(os.path.join(DEFAULT_CONFIG_DIR, config_path))  # module-level helper
+        if saved_values is None:
+            not_found = ALERTS["err_config_not_found"][lang]
+            gr.Warning(not_found)
+            return {output_box: not_found}
+
+        updates: dict["Component", Any] = {output_box: ALERTS["info_config_loaded"][lang]}
+        updates.update((self.manager.get_elem_by_id(elem_id), value) for elem_id, value in saved_values.items())
+
+        return updates
+
+    def check_output_dir(self, lang: str, model_name: str, finetuning_type: str, output_dir: str):
+        r"""Warn if the output directory exists and restore the LlamaBoard config saved in it, if any."""
+        output_box = self.manager.get_elem_by_id("train.output_box")
+        output_dict: dict[Component, Any] = {output_box: LOCALES["output_box"][lang]["value"]}
+        if model_name and output_dir and os.path.isdir(get_save_dir(model_name, finetuning_type, output_dir)):
+            gr.Warning(ALERTS["warn_output_dir_exists"][lang])
+            output_dict[output_box] = ALERTS["warn_output_dir_exists"][lang]
+
+            output_dir = get_save_dir(model_name, finetuning_type, output_dir)
+            config_dict = load_args(os.path.join(output_dir, LLAMABOARD_CONFIG)) or {}  # None if not loadable
+            for elem_id, value in config_dict.items():
+                output_dict[self.manager.get_elem_by_id(elem_id)] = value
+
+        return output_dict
diff --git a/LlamaFactory/src/train.py b/LlamaFactory/src/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba846a024a72853948ee2757e69b44551c63b12
--- /dev/null
+++ b/LlamaFactory/src/train.py
@@ -0,0 +1,28 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from llamafactory.train.tuner import run_exp
+
+
+def main():  # entry point used by the __main__ guard below
+    run_exp()
+
+
+def _mp_fn(index):
+    # For xla_spawn (TPUs); the index argument is accepted but not used
+    run_exp()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/LlamaFactory/src/webui.py b/LlamaFactory/src/webui.py
new file mode 100644
index 0000000000000000000000000000000000000000..f13d2f26c30dd259baf6394d9b293e87a7664450
--- /dev/null
+++ b/LlamaFactory/src/webui.py
@@ -0,0 +1,31 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from llamafactory.extras.misc import fix_proxy, is_env_enabled
+from llamafactory.webui.interface import create_ui
+
+
+def main():
+    """Launch the Web UI, configured by GRADIO_IPV6, GRADIO_SHARE and GRADIO_SERVER_NAME."""
+    use_ipv6, share = is_env_enabled("GRADIO_IPV6"), is_env_enabled("GRADIO_SHARE")
+    host = os.getenv("GRADIO_SERVER_NAME", "[::]" if use_ipv6 else "0.0.0.0")
+    print("Visit http://ip:port for Web UI, e.g., http://127.0.0.1:7860")
+    fix_proxy(ipv6_enabled=use_ipv6)
+    create_ui().queue().launch(share=share, server_name=host, inbrowser=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/LlamaFactory/tests/check_license.py b/LlamaFactory/tests/check_license.py
new file mode 100644
index 0000000000000000000000000000000000000000..1512347d92bc6ace6b53200899c9c8df87f5edb5
--- /dev/null
+++ b/LlamaFactory/tests/check_license.py
@@ -0,0 +1,38 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from pathlib import Path
+
+
+KEYWORDS = ("Copyright", "2025", "LlamaFactory")  # all must appear in a checked file's first line
+
+
+def main():
+    """Check that the first line of each non-empty ``*.py`` file under the argv directories holds all KEYWORDS."""
+    path_list = [path for check_dir in sys.argv[1:] for path in Path(check_dir).glob("**/*.py")]
+
+    for path in path_list:
+        first_line = path.absolute().read_text(encoding="utf-8").strip().split("\n")[0]
+        if not first_line:  # the file is empty or whitespace-only
+            continue
+
+        print(f"Check license: {path}")
+        # every keyword must occur in that first line
+        missing = [keyword for keyword in KEYWORDS if keyword not in first_line]
+        assert not missing, f"File {path} does not contain license."
+
+
+if __name__ == "__main__":
+    main()
diff --git a/LlamaFactory/tests/conftest.py b/LlamaFactory/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..976b48ccc56b5375284e9cee575637a4b814930d
--- /dev/null
+++ b/LlamaFactory/tests/conftest.py
@@ -0,0 +1,168 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""LlamaFactory test configuration.
+
+Contains shared fixtures, pytest configuration, and custom markers.
+"""
+
+import os
+
+import pytest
+import torch
+import torch.distributed as dist
+from pytest import Config, FixtureRequest, Item, MonkeyPatch
+
+from llamafactory.extras.misc import get_current_device, get_device_count, is_env_enabled
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.train.test_utils import patch_valuehead_model
+
+
+CURRENT_DEVICE = get_current_device().type  # device type of the current device, read once at import
+
+
+def pytest_configure(config: Config):
+    """Register the custom markers used by this test suite.
+
+    Each entry below is appended to the ``markers`` ini option as ``name: description``.
+    """
+    marker_specs = (
+        "slow: marks tests as slow (deselect with '-m \"not slow\"' or set RUN_SLOW=1 to run)",
+        "runs_on: test requires specific device type, e.g., @pytest.mark.runs_on(['cuda'])",
+        "require_distributed(num_devices): allow multi-device execution (default: 2)",
+    )
+
+    # registered in the order listed above
+    for spec in marker_specs:
+        config.addinivalue_line("markers", spec)
+
+
+def _handle_runs_on(items: list[Item]):
+    """Skip every ``runs_on`` test whose allowed device types do not include the current one."""
+    for test_item in items:
+        runs_on = test_item.get_closest_marker("runs_on")
+        if not runs_on:
+            continue
+
+        # a bare string is shorthand for a one-element list
+        allowed = runs_on.args[0]
+        allowed = [allowed] if isinstance(allowed, str) else allowed
+        if CURRENT_DEVICE not in allowed:
+            reason = f"test requires one of {allowed} (current: {CURRENT_DEVICE})"
+            test_item.add_marker(pytest.mark.skip(reason=reason))
+
+
+def _handle_slow_tests(items: list[Item]):
+    """Mark tests carrying the ``slow`` keyword as skipped unless RUN_SLOW is enabled."""
+    if is_env_enabled("RUN_SLOW"):
+        return
+    skip_slow = pytest.mark.skip(reason="slow test (set RUN_SLOW=1 to run)")
+    for slow_item in (item for item in items if "slow" in item.keywords):
+        slow_item.add_marker(skip_slow)
+
+
+def _get_visible_devices_env() -> str | None:
+    """Name the device-visibility env var for the current device type, or None if there is none."""
+    env_names = {
+        "cuda": "CUDA_VISIBLE_DEVICES",
+        "npu": "ASCEND_RT_VISIBLE_DEVICES",
+    }
+    # any other device type has no entry, so .get() yields None
+    return env_names.get(CURRENT_DEVICE)
+
+
+def _handle_device_visibility(items: list[Item]):
+    """Skip ``require_distributed`` tests that need more devices than are currently visible."""
+    env_key = _get_visible_devices_env()
+    if env_key is None or CURRENT_DEVICE in ("cpu", "mps"):
+        return
+
+    # Unset variable: ask get_device_count(); otherwise count the non-empty comma-separated entries.
+    env_value = os.environ.get(env_key)
+    entries = None if env_value is None else [entry for entry in env_value.split(",") if entry != ""]
+    available = get_device_count() if entries is None else len(entries)
+
+    for test_item in items:
+        marker = test_item.get_closest_marker("require_distributed")
+        if not marker:
+            continue
+
+        required = marker.args[0] if marker.args else 2
+        if available >= required:
+            continue
+
+        reason = f"test requires {required} devices, but only {available} visible"
+        test_item.add_marker(pytest.mark.skip(reason=reason))
+
+
+def pytest_collection_modifyitems(config: Config, items: list[Item]):
+    """Attach skip markers to the collected tests according to versions, markers and environment."""
+    # tests under "tests_v1" are skipped unless transformers passes the 4.57.0 version check
+    skip_bc = pytest.mark.skip(reason="Skip backward compatibility tests")
+    for test_item in items:
+        in_v1_suite = "tests_v1" in str(test_item.fspath)
+        if in_v1_suite and not is_transformers_version_greater_than("4.57.0"):
+            test_item.add_marker(skip_bc)
+
+    for handler in (_handle_slow_tests, _handle_runs_on, _handle_device_visibility):
+        handler(items)
+
+
+@pytest.fixture(autouse=True)
+def _cleanup_distributed_state():
+    """Autouse fixture: after each test, destroy the process group if one is initialized."""
+    yield  # the test runs at this point; the lines below are the teardown
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+@pytest.fixture(autouse=True)
+def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -> None:
+    """Pin the device-visibility environment variable for each test.
+
+    Tests marked ``require_distributed`` get the requested devices; every other test
+    gets a single device and, on cuda / npu, a patched ``device_count`` of 1.
+    """
+    env_key = _get_visible_devices_env()
+    if not env_key:
+        return
+
+    # Read the current value before patching; monkeypatch handles restoration.
+    previous = os.environ.get(env_key)
+
+    marker = request.node.get_closest_marker("require_distributed")
+    if marker:  # distributed test
+        required = marker.args[0] if marker.args else 2
+        chosen = marker.args[1] if len(marker.args) > 1 else None
+        device_ids = chosen if chosen else range(required)
+
+        monkeypatch.setenv(env_key, ",".join(map(str, device_ids)))
+        monkeypatch.syspath_prepend(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+        return
+
+    # non-distributed test: keep only the first listed device, falling back to "0"
+    listed = [v for v in previous.split(",") if v != ""] if previous else []
+    monkeypatch.setenv(env_key, listed[0] if listed else "0")
+
+    # patch the reported device count to 1 as well
+    if CURRENT_DEVICE == "cuda":
+        monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+    elif CURRENT_DEVICE == "npu":
+        monkeypatch.setattr(torch.npu, "device_count", lambda: 1)
+
+
+@pytest.fixture
+def fix_valuehead_cpu_loading():
+    """Opt-in fixture that calls ``patch_valuehead_model()`` during setup of the requesting test."""
+    patch_valuehead_model()
diff --git a/LlamaFactory/tests/data/processor/test_feedback.py b/LlamaFactory/tests/data/processor/test_feedback.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2c69d08a2642ed566470a1fb093d7a30f7c9ff0
--- /dev/null
+++ b/LlamaFactory/tests/data/processor/test_feedback.py
@@ -0,0 +1,65 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+
+import pytest
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.train.test_utils import load_dataset_module
+
+
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")  # dataset id; override via the DEMO_DATA env var
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # override via the TINY_LLAMA3 env var
+
+TRAIN_ARGS = {  # keyword arguments for load_dataset_module() in the test below
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "kto",
+    "do_train": True,
+    "finetuning_type": "full",
+    "dataset": "kto_en_demo",  # the same name is loaded directly with load_dataset() as the reference
+    "dataset_dir": "REMOTE:" + DEMO_DATA,
+    "template": "llama3",
+    "cutoff_len": 8192,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("num_samples", [16])
+def test_feedback_data(num_samples: int):
+    """Compare KTO-processed samples with ids and labels rebuilt via the reference chat template."""
+    train_dataset = load_dataset_module(**TRAIN_ARGS)["train_dataset"]
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    original_data = load_dataset(DEMO_DATA, name="kto_en_demo", split="train")
+    for index in random.choices(range(len(original_data)), k=num_samples):
+        # Index by row: one example is read per lookup instead of going through a whole column.
+        raw_example, processed = original_data[index], train_dataset[index]
+        messages = raw_example["messages"]
+        ref_input_ids = ref_tokenizer.apply_chat_template(messages)
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+        if is_transformers_version_greater_than("5.0.0"):  # here the ids sit under "input_ids"
+            ref_input_ids, ref_prompt_ids = ref_input_ids["input_ids"], ref_prompt_ids["input_ids"]
+        prompt_len = len(ref_prompt_ids)  # prompt tokens are masked out of the labels
+        ref_labels = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
+        assert processed["input_ids"] == ref_input_ids
+        assert processed["labels"] == ref_labels
+        assert processed["kto_tags"] == raw_example["label"]
diff --git a/LlamaFactory/tests/data/processor/test_pairwise.py b/LlamaFactory/tests/data/processor/test_pairwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..17d5609845571c67037159994ceaae9f62f9f69e
--- /dev/null
+++ b/LlamaFactory/tests/data/processor/test_pairwise.py
@@ -0,0 +1,86 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+
+import pytest
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.train.test_utils import load_dataset_module
+
+
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")  # dataset id; override via the DEMO_DATA env var
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # override via the TINY_LLAMA3 env var
+
+TRAIN_ARGS = {  # keyword arguments for load_dataset_module() in the test below
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "rm",
+    "do_train": True,
+    "finetuning_type": "full",
+    "dataset": "dpo_en_demo",  # the same name is loaded directly with load_dataset() as the reference
+    "dataset_dir": "REMOTE:" + DEMO_DATA,
+    "template": "llama3",
+    "cutoff_len": 8192,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+
+def _convert_sharegpt_to_openai(messages: list[dict[str, str]]) -> list[dict[str, str]]:
+    """Convert sharegpt turns (``from``/``value``) to OpenAI-style ``role``/``content`` messages.
+
+    Raises ``KeyError`` for a ``from`` value other than ``human``, ``gpt`` or ``system``.
+    """
+    role_mapping = {"human": "user", "gpt": "assistant", "system": "system"}
+    return [{"role": role_mapping[turn["from"]], "content": turn["value"]} for turn in messages]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("num_samples", [16])
+def test_pairwise_data(num_samples: int):
+    """Compare pairwise ("rm" stage) samples with ids and labels rebuilt via the reference chat template.
+
+    For each sampled row the chosen and the rejected answer are appended in turn to the shared
+    conversation; the rendered ids and prompt-masked labels must equal the processed columns.
+    """
+    train_dataset = load_dataset_module(**TRAIN_ARGS)["train_dataset"]
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    original_data = load_dataset(DEMO_DATA, name="dpo_en_demo", split="train")
+
+    for index in random.choices(range(len(original_data)), k=num_samples):
+        # Index by row: one example is read per lookup instead of going through a whole column.
+        raw_example, processed = original_data[index], train_dataset[index]
+
+        # Same checks for both answers; `side` also selects the "<side>_input_ids"/"<side>_labels" columns.
+        for side in ("chosen", "rejected"):
+            messages = _convert_sharegpt_to_openai(raw_example["conversations"] + [raw_example[side]])
+
+            ref_input_ids = ref_tokenizer.apply_chat_template(messages)
+            ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+            if is_transformers_version_greater_than("5.0.0"):  # here the ids sit under "input_ids"
+                ref_input_ids = ref_input_ids["input_ids"]
+                ref_prompt_ids = ref_prompt_ids["input_ids"]
+
+            # Prompt tokens are masked with IGNORE_INDEX; the answer tokens stay as labels.
+            prompt_len = len(ref_prompt_ids)
+            ref_labels = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
+
+            assert processed[f"{side}_input_ids"] == ref_input_ids
+            assert processed[f"{side}_labels"] == ref_labels
diff --git a/LlamaFactory/tests/data/processor/test_processor_utils.py b/LlamaFactory/tests/data/processor/test_processor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2a3b7ebe05e15b9dfaf603853db960a7dfdb650
--- /dev/null
+++ b/LlamaFactory/tests/data/processor/test_processor_utils.py
@@ -0,0 +1,35 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+
+from llamafactory.data.processor.processor_utils import infer_seqlen
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize(
+    "test_input,test_output",  # infer_seqlen's three positional arguments -> the pair it must return
+    [
+        ((3000, 2000, 1000), (600, 400)),  # first two both exceed the third: 3:2 kept, outputs sum to 1000
+        ((2000, 3000, 1000), (400, 600)),
+        ((1000, 100, 1000), (900, 100)),  # only one value is large: the small one is kept, sum is 1000
+        ((100, 1000, 1000), (100, 900)),
+        ((100, 500, 1000), (100, 500)),  # first two sum to less than the third: returned unchanged
+        ((500, 100, 1000), (500, 100)),
+        ((10, 10, 1000), (10, 10)),
+    ],
+)
+def test_infer_seqlen(test_input: tuple[int, int, int], test_output: tuple[int, int]):
+    assert test_output == infer_seqlen(*test_input)  # exact tuple equality
diff --git a/LlamaFactory/tests/data/processor/test_supervised.py b/LlamaFactory/tests/data/processor/test_supervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..f515852e1982c6c73fe64465b40ac48283317f05
--- /dev/null
+++ b/LlamaFactory/tests/data/processor/test_supervised.py
@@ -0,0 +1,132 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+
+import pytest
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.train.test_utils import load_dataset_module
+
+
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")  # dataset id; override via the DEMO_DATA env var
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # override via the TINY_LLAMA3 env var
+
+TINY_DATA = os.getenv("TINY_DATA", "llamafactory/tiny-supervised-dataset")  # override via the TINY_DATA env var
+
+TRAIN_ARGS = {  # shared keyword arguments; each test passes its own dataset and dataset_dir on top
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "full",
+    "template": "llama3",
+    "cutoff_len": 8192,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("num_samples", [16])
+def test_supervised_single_turn(num_samples: int):
+    """Compare SFT samples built from instruction/input/output rows with the reference chat template."""
+    train_dataset = load_dataset_module(dataset_dir="ONLINE", dataset=TINY_DATA, **TRAIN_ARGS)["train_dataset"]
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    original_data = load_dataset(TINY_DATA, split="train")
+    for index in random.choices(range(len(original_data)), k=num_samples):
+        # Index by row: one example is read per lookup instead of going through a whole column.
+        raw_example, processed = original_data[index], train_dataset[index]
+        # The user turn is the instruction, plus the optional input on a new line.
+        prompt = raw_example["instruction"]
+        if raw_example["input"]:
+            prompt += "\n" + raw_example["input"]
+        messages = [
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": raw_example["output"]},
+        ]
+        ref_input_ids = ref_tokenizer.apply_chat_template(messages)
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+        if is_transformers_version_greater_than("5.0.0"):  # here the ids sit under "input_ids"
+            ref_input_ids, ref_prompt_ids = ref_input_ids["input_ids"], ref_prompt_ids["input_ids"]
+
+        prompt_len = len(ref_prompt_ids)  # prompt tokens are masked out of the labels
+        ref_label_ids = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
+        assert processed["input_ids"] == ref_input_ids
+        assert processed["labels"] == ref_label_ids
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("num_samples", [8])
+def test_supervised_multi_turn(num_samples: int):
+    """Compare the input ids of multi-turn SFT samples with the reference chat template."""
+    train_dataset = load_dataset_module(dataset_dir="REMOTE:" + DEMO_DATA, dataset="system_chat", **TRAIN_ARGS)[
+        "train_dataset"
+    ]
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
+    for index in random.choices(range(len(original_data)), k=num_samples):
+        # Index by row: one example is read per lookup instead of going through a whole column.
+        ref_input_ids = ref_tokenizer.apply_chat_template(original_data[index]["messages"])
+        if is_transformers_version_greater_than("5.0.0"):  # here the ids sit under "input_ids"
+            ref_input_ids = ref_input_ids["input_ids"]
+        # cannot test the label ids in multi-turn case
+        assert train_dataset[index]["input_ids"] == ref_input_ids
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("num_samples", [4])
+def test_supervised_train_on_prompt(num_samples: int):
+    """With ``train_on_prompt=True``, the labels must equal the full input ids (nothing is masked)."""
+    train_dataset = load_dataset_module(
+        dataset_dir="REMOTE:" + DEMO_DATA, dataset="system_chat", train_on_prompt=True, **TRAIN_ARGS
+    )["train_dataset"]
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
+    for index in random.choices(range(len(original_data)), k=num_samples):
+        processed = train_dataset[index]  # one row lookup serves both assertions
+        ref_input_ids = ref_tokenizer.apply_chat_template(original_data[index]["messages"])
+        if is_transformers_version_greater_than("5.0.0"):  # here the ids sit under "input_ids"
+            ref_input_ids = ref_input_ids["input_ids"]
+        assert processed["input_ids"] == ref_input_ids
+        assert processed["labels"] == ref_input_ids
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("num_samples", [4])
+def test_supervised_mask_history(num_samples: int):
+    """With ``mask_history=True``, labels must keep only the tokens after the last generation prompt."""
+    train_dataset = load_dataset_module(
+        dataset_dir="REMOTE:" + DEMO_DATA, dataset="system_chat", mask_history=True, **TRAIN_ARGS
+    )["train_dataset"]
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
+    for index in random.choices(range(len(original_data)), k=num_samples):
+        # Index by row: one example is read per lookup instead of going through a whole column.
+        raw_example, processed = original_data[index], train_dataset[index]
+        messages = raw_example["messages"]
+        ref_input_ids = ref_tokenizer.apply_chat_template(messages)
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+        if is_transformers_version_greater_than("5.0.0"):  # here the ids sit under "input_ids"
+            ref_input_ids, ref_prompt_ids = ref_input_ids["input_ids"], ref_prompt_ids["input_ids"]
+
+        prompt_len = len(ref_prompt_ids)  # everything before the last answer is masked
+        ref_label_ids = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
+        assert processed["input_ids"] == ref_input_ids
+        assert processed["labels"] == ref_label_ids
diff --git a/LlamaFactory/tests/data/processor/test_unsupervised.py b/LlamaFactory/tests/data/processor/test_unsupervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d06219fc582d4cb91f161c95ffe9f14fb5d324f
--- /dev/null
+++ b/LlamaFactory/tests/data/processor/test_unsupervised.py
@@ -0,0 +1,68 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+
+import pytest
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.train.test_utils import load_dataset_module
+
+
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")  # dataset id; override via the DEMO_DATA env var
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # override via the TINY_LLAMA3 env var
+
+TINY_DATA = os.getenv("TINY_DATA", "llamafactory/tiny-supervised-dataset")  # not referenced by the test below
+
+TRAIN_ARGS = {  # keyword arguments for load_dataset_module() in the test below
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "ppo",
+    "do_train": True,
+    "finetuning_type": "full",
+    "reward_model": "",
+    "reward_model_type": "full",
+    "dataset": "system_chat",  # the same name is loaded directly with load_dataset() as the reference
+    "dataset_dir": "REMOTE:" + DEMO_DATA,
+    "template": "llama3",
+    "cutoff_len": 8192,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+    "report_to": "none",  # transformers compatibility
+}
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("num_samples", [16])
+def test_unsupervised_data(num_samples: int):
+    """For the "ppo" stage, inputs must be the prompt ids and labels the ids of the final response."""
+    train_dataset = load_dataset_module(**TRAIN_ARGS)["train_dataset"]
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
+    for index in random.choices(range(len(original_data)), k=num_samples):
+        # Index by row: one example is read per lookup instead of going through a whole column.
+        raw_example, processed = original_data[index], train_dataset[index]
+        messages = raw_example["messages"]
+        ref_input_ids = ref_tokenizer.apply_chat_template(messages)
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+        if is_transformers_version_greater_than("5.0.0"):  # here the ids sit under "input_ids"
+            ref_input_ids, ref_prompt_ids = ref_input_ids["input_ids"], ref_prompt_ids["input_ids"]
+
+        ref_labels = ref_input_ids[len(ref_prompt_ids) :]  # what follows the generation prompt
+        assert processed["input_ids"] == ref_prompt_ids
+        assert processed["labels"] == ref_labels
diff --git a/LlamaFactory/tests/data/test_collator.py b/LlamaFactory/tests/data/test_collator.py
new file mode 100644
index 0000000000000000000000000000000000000000..63370b1b68509b667761357e074fd3d74a715ab7
--- /dev/null
+++ b/LlamaFactory/tests/data/test_collator.py
@@ -0,0 +1,173 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+import torch
+from PIL import Image
+from transformers import AutoConfig, AutoModelForImageTextToText
+
+from llamafactory.data import get_template_and_fix_tokenizer
+from llamafactory.data.collator import MultiModalDataCollatorForSeq2Seq, prepare_4d_attention_mask
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.hparams import get_infer_args
+from llamafactory.model import load_tokenizer
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_base_collator():
+    """Check how ``MultiModalDataCollatorForSeq2Seq`` pads a text-only batch.
+
+    Two samples of lengths 6 and 2 are collated with ``pad_to_multiple_of=8``.
+    """
+    model_args, data_args, *_ = get_infer_args({"model_name_or_path": TINY_LLAMA3, "template": "default"})
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    data_collator = MultiModalDataCollatorForSeq2Seq(
+        template=get_template_and_fix_tokenizer(tokenizer, data_args),
+        pad_to_multiple_of=8,
+        label_pad_token_id=IGNORE_INDEX,
+        **tokenizer_module,
+    )
+
+    # The pad id is read after get_template_and_fix_tokenizer() has been applied to the tokenizer.
+    p, q = tokenizer.pad_token_id, IGNORE_INDEX
+    features = [
+        {"input_ids": [0, 1, 2, 3, 4, 5], "attention_mask": [1, 1, 1, 1, 1, 1], "labels": [q, q, 2, 3, 4, 5]},
+        {"input_ids": [6, 7], "attention_mask": [1, 1], "labels": [q, 7]},
+    ]
+    batch_input = data_collator(features)
+
+    # Both rows are expected at length 8: ids filled with p, attention mask with 0, labels with q.
+    expected_input = {
+        "input_ids": [
+            [0, 1, 2, 3, 4, 5, p, p],
+            [6, 7, p, p, p, p, p, p],
+        ],
+        "attention_mask": [
+            [1, 1, 1, 1, 1, 1, 0, 0],
+            [1, 1, 0, 0, 0, 0, 0, 0],
+        ],
+        "labels": [
+            [q, q, 2, 3, 4, 5, q, q],
+            [q, 7, q, q, q, q, q, q],
+        ],
+    }
+
+    for key, tensor in batch_input.items():
+        assert tensor.eq(torch.tensor(expected_input[key])).all()
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_multimodal_collator():
+    """Collate one text-only sample with the ``qwen2_vl`` template and check every field of the batch."""
+    model_args, data_args, *_ = get_infer_args(
+        {"model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct", "template": "qwen2_vl"}
+    )
+    tokenizer_module = load_tokenizer(model_args)
+    template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+    with torch.device("meta"):  # the model is built from the config alone, on the meta device
+        model = AutoModelForImageTextToText.from_config(config)
+
+    data_collator = MultiModalDataCollatorForSeq2Seq(
+        template=template,
+        model=model,
+        pad_to_multiple_of=4,
+        label_pad_token_id=IGNORE_INDEX,
+        **tokenizer_module,
+    )
+    p = tokenizer_module["tokenizer"].pad_token_id  # padding id for input_ids
+    q = IGNORE_INDEX  # filler for labels
+    s = tokenizer_module["tokenizer"].convert_tokens_to_ids("<|vision_start|>")
+    e = tokenizer_module["tokenizer"].convert_tokens_to_ids("<|vision_end|>")
+    m = tokenizer_module["tokenizer"].convert_tokens_to_ids("<|image_pad|>")
+    fake_image = Image.new("RGB", (64, 64), (255, 255, 255))  # only used to build the expected processor output
+    features = [  # a single sample of four tokens; no image is passed in
+        {
+            "input_ids": [0, 1, 2, 3],
+            "attention_mask": [1, 1, 1, 1],
+            "labels": [0, 1, 2, 3],
+        },
+    ]
+    batch_input = data_collator(features)
+    expected_input = {
+        "input_ids": [
+            [0, 1, 2, 3, s, m, m, m, m, e, p, p],  # the 4 given ids, then s, 4 x m, e, then p up to length 12
+        ],
+        "attention_mask": [
+            [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # only the 4 given tokens are attended
+        ],
+        "labels": [
+            [0, 1, 2, 3, q, q, q, q, q, q, q, q],  # everything after the given tokens is ignored
+        ],
+        "position_ids": [  # three identical entries, each holding one row per sample
+            [[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]],
+            [[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]],
+            [[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]],
+        ],
+        "rope_deltas": [[-8]],
+        **tokenizer_module["processor"].image_processor(fake_image),  # image-processor fields for the white image
+    }
+    assert batch_input.keys() == expected_input.keys()  # no missing and no extra fields
+    for k in batch_input.keys():
+        assert batch_input[k].eq(torch.tensor(expected_input[k])).all()
+
+
+@pytest.mark.runs_on(["cpu"])
+def test_4d_attention_mask():
+    """Expand an index-valued 2D mask with ``prepare_4d_attention_mask`` and check the float16 4D result.
+
+    Expected pattern: positions sharing a non-zero index form a lower-triangular block of 0.0; all
+    other entries, including the whole row of the index-0 position, hold the float16 minimum.
+    """
+    o, x = 0.0, torch.finfo(torch.float16).min
+    index_mask = torch.tensor(
+        [
+            [1, 1, 2, 2, 2, 0],
+            [1, 2, 2, 3, 3, 3],
+        ]
+    )
+    computed = prepare_4d_attention_mask(index_mask, torch.float16)
+
+    first_sample = [  # indices 1, 1, 2, 2, 2, 0
+        [o, x, x, x, x, x],
+        [o, o, x, x, x, x],
+        [x, x, o, x, x, x],
+        [x, x, o, o, x, x],
+        [x, x, o, o, o, x],
+        [x, x, x, x, x, x],
+    ]
+    second_sample = [  # indices 1, 2, 2, 3, 3, 3
+        [o, x, x, x, x, x],
+        [x, o, x, x, x, x],
+        [x, o, o, x, x, x],
+        [x, x, x, o, x, x],
+        [x, x, x, o, o, x],
+        [x, x, x, o, o, o],
+    ]
+
+    # Each sample gets a singleton second axis, hence the extra list level.
+    expected = torch.tensor([[first_sample], [second_sample]], dtype=torch.float16)
+
+    assert tuple(computed.shape) == (2, 1, 6, 6)
+    assert torch.all(computed == expected)
+
+
+if __name__ == "__main__":  # running this file directly executes only the multimodal collator test
+    test_multimodal_collator()
diff --git a/LlamaFactory/tests/data/test_converter.py b/LlamaFactory/tests/data/test_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b411aed53ce66aae1a2e078ab82508cc94378f9
--- /dev/null
+++ b/LlamaFactory/tests/data/test_converter.py
@@ -0,0 +1,64 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from llamafactory.data import Role
+from llamafactory.data.converter import get_dataset_converter
+from llamafactory.data.parser import DatasetAttr
+from llamafactory.hparams import DataArguments
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_alpaca_converter():
+    """Convert one alpaca-style record and check every field of the converted example."""
+    dataset_attr = DatasetAttr("hf_hub", "llamafactory/tiny-supervised-dataset")
+    convert = get_dataset_converter("alpaca", dataset_attr, DataArguments())
+    record = {"instruction": "Solve the math problem.", "input": "3 + 4", "output": "The answer is 7."}
+
+    # Instruction and input are expected to be joined by a newline into a single user turn.
+    expected = {
+        "_prompt": [{"role": Role.USER.value, "content": "Solve the math problem.\n3 + 4"}],
+        "_response": [{"role": Role.ASSISTANT.value, "content": "The answer is 7."}],
+        "_system": "",
+        "_tools": "",
+        "_images": None,
+        "_videos": None,
+        "_audios": None,
+    }
+
+    assert convert(record) == expected
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_sharegpt_converter():
+    """Convert one sharegpt-style conversation and check every field of the converted example."""
+    dataset_attr = DatasetAttr("hf_hub", "llamafactory/tiny-supervised-dataset")
+    convert = get_dataset_converter("sharegpt", dataset_attr, DataArguments())
+    conversations = [
+        {"from": "system", "value": "You are a helpful assistant."},
+        {"from": "human", "value": "Solve the math problem.\n3 + 4"},
+        {"from": "gpt", "value": "The answer is 7."},
+    ]
+    # The system turn is expected under "_system"; the human/gpt turns become prompt and response.
+    expected = {
+        "_prompt": [{"role": Role.USER.value, "content": "Solve the math problem.\n3 + 4"}],
+        "_response": [{"role": Role.ASSISTANT.value, "content": "The answer is 7."}],
+        "_system": "You are a helpful assistant.",
+        "_tools": "",
+        "_images": None,
+        "_videos": None,
+        "_audios": None,
+    }
+    assert convert({"conversations": conversations}) == expected
diff --git a/LlamaFactory/tests/data/test_formatter.py b/LlamaFactory/tests/data/test_formatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aaa6f991da21294e6c80f4bcc3f21da9d89df9d
--- /dev/null
+++ b/LlamaFactory/tests/data/test_formatter.py
@@ -0,0 +1,382 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from datetime import datetime
+
+import pytest
+
+from llamafactory.data.formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter
+
+
+# A single tool call (name plus keyword arguments); the function-formatter tests below pass it through
+# json.dumps, either alone or repeated in a list.
+FUNCTION = {"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}
+
+# One tool definition with a required string parameter ("foo") and a non-required number parameter ("bar");
+# the tool-formatter tests below serialize this list with json.dumps.
+TOOLS = [
+    {
+        "name": "test_tool",
+        "description": "tool_desc",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "foo": {"type": "string", "description": "foo_desc"},
+                "bar": {"type": "number", "description": "bar_desc"},
+            },
+            "required": ["foo"],
+        },
+    }
+]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_empty_formatter():
+    formatter = EmptyFormatter(slots=["\n"])
+    assert formatter.apply() == ["\n"]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_string_formatter():
+    formatter = StringFormatter(slots=["<s>", "Human: {{content}}\nAssistant:"])
+    assert formatter.apply(content="Hi") == ["<s>", "Human: Hi\nAssistant:"]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}", "</s>"], tool_format="default")
+    tool_calls = json.dumps(FUNCTION)
+    assert formatter.apply(content=tool_calls) == [
+        """Action: tool_name\nAction Input: {"foo": "bar", "size": 10}""",
+        "</s>",
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_multi_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}", "</s>"], tool_format="default")
+    tool_calls = json.dumps([FUNCTION] * 2)
+    assert formatter.apply(content=tool_calls) == [
+        """Action: tool_name\nAction Input: {"foo": "bar", "size": 10}\n"""
+        """Action: tool_name\nAction Input: {"foo": "bar", "size": 10}""",
+        "</s>",
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_default_tool_formatter():
+    formatter = ToolFormatter(tool_format="default")
+    assert formatter.apply(content=json.dumps(TOOLS)) == [
+        "You have access to the following tools:\n"
+        "> Tool Name: test_tool\n"
+        "Tool Description: tool_desc\n"
+        "Tool Args:\n"
+        "  - foo (string, required): foo_desc\n"
+        "  - bar (number): bar_desc\n\n"
+        "Use the following format if using a tool:\n"
+        "```\n"
+        "Action: tool name (one of [test_tool])\n"
+        "Action Input: the input to the tool, in a JSON format representing the kwargs "
+        """(e.g. ```{"input": "hello world", "num_beams": 5}```)\n"""
+        "```\n"
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_default_tool_extractor():
+    formatter = ToolFormatter(tool_format="default")
+    result = """Action: test_tool\nAction Input: {"foo": "bar", "size": 10}"""
+    assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_default_multi_tool_extractor():
+    formatter = ToolFormatter(tool_format="default")
+    result = (
+        """Action: test_tool\nAction Input: {"foo": "bar", "size": 10}\n"""
+        """Action: another_tool\nAction Input: {"foo": "job", "size": 2}"""
+    )
+    assert formatter.extract(result) == [
+        ("test_tool", """{"foo": "bar", "size": 10}"""),
+        ("another_tool", """{"foo": "job", "size": 2}"""),
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_glm4_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}"], tool_format="glm4")
+    tool_calls = json.dumps(FUNCTION)
+    assert formatter.apply(content=tool_calls) == ["""tool_name\n{"foo": "bar", "size": 10}"""]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_glm4_tool_formatter():
+    formatter = ToolFormatter(tool_format="glm4")
+    assert formatter.apply(content=json.dumps(TOOLS)) == [
+        "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱 AI 公司训练的语言模型 GLM-4 模型开发的，"
+        "你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具\n\n"
+        f"## test_tool\n\n{json.dumps(TOOLS[0], indent=4, ensure_ascii=False)}\n"
+        "在调用上述函数时，请使用 Json 格式表示调用的参数。"
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_glm4_tool_extractor():
+    formatter = ToolFormatter(tool_format="glm4")
+    result = """test_tool\n{"foo": "bar", "size": 10}\n"""
+    assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llama3_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3")
+    tool_calls = json.dumps(FUNCTION)
+    assert formatter.apply(content=tool_calls) == [
+        """{"name": "tool_name", "parameters": {"foo": "bar", "size": 10}}<|eot_id|>"""
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llama3_multi_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}<|eot_id|>"], tool_format="llama3")
+    tool_calls = json.dumps([FUNCTION] * 2)
+    assert formatter.apply(content=tool_calls) == [
+        """[{"name": "tool_name", "parameters": {"foo": "bar", "size": 10}}, """
+        """{"name": "tool_name", "parameters": {"foo": "bar", "size": 10}}]"""
+        """<|eot_id|>"""
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llama3_tool_formatter():
+    formatter = ToolFormatter(tool_format="llama3")
+    date = datetime.now().strftime("%d %b %Y")
+    wrapped_tool = {"type": "function", "function": TOOLS[0]}
+    assert formatter.apply(content=json.dumps(TOOLS)) == [
+        f"Cutting Knowledge Date: December 2023\nToday Date: {date}\n\n"
+        "You have access to the following functions. "
+        "To call a function, please respond with JSON for a function call. "
+        """Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. """
+        f"Do not use variables.\n\n{json.dumps(wrapped_tool, indent=4, ensure_ascii=False)}\n\n"
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llama3_tool_extractor():
+    formatter = ToolFormatter(tool_format="llama3")
+    result = """{"name": "test_tool", "parameters": {"foo": "bar", "size": 10}}\n"""
+    assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llama3_multi_tool_extractor():
+    formatter = ToolFormatter(tool_format="llama3")
+    result = (
+        """[{"name": "test_tool", "parameters": {"foo": "bar", "size": 10}}, """
+        """{"name": "another_tool", "parameters": {"foo": "job", "size": 2}}]"""
+    )
+    assert formatter.extract(result) == [
+        ("test_tool", """{"foo": "bar", "size": 10}"""),
+        ("another_tool", """{"foo": "job", "size": 2}"""),
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_mistral_function_formatter():
+    formatter = FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", "</s>"], tool_format="mistral")
+    tool_calls = json.dumps(FUNCTION)
+    assert formatter.apply(content=tool_calls) == [
+        "[TOOL_CALLS] " """[{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}]""",
+        "</s>",
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_mistral_multi_function_formatter():
+    formatter = FunctionFormatter(slots=["[TOOL_CALLS] {{content}}", "</s>"], tool_format="mistral")
+    tool_calls = json.dumps([FUNCTION] * 2)
+    assert formatter.apply(content=tool_calls) == [
+        "[TOOL_CALLS] "
+        """[{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}, """
+        """{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}]""",
+        "</s>",
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_mistral_tool_formatter():
+    formatter = ToolFormatter(tool_format="mistral")
+    wrapped_tool = {"type": "function", "function": TOOLS[0]}
+    assert formatter.apply(content=json.dumps(TOOLS)) == [
+        "[AVAILABLE_TOOLS] " + json.dumps([wrapped_tool], ensure_ascii=False) + "[/AVAILABLE_TOOLS]"
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_mistral_tool_extractor():
+    formatter = ToolFormatter(tool_format="mistral")
+    result = """{"name": "test_tool", "arguments": {"foo": "bar", "size": 10}}"""
+    assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_mistral_multi_tool_extractor():
+    formatter = ToolFormatter(tool_format="mistral")
+    result = (
+        """[{"name": "test_tool", "arguments": {"foo": "bar", "size": 10}}, """
+        """{"name": "another_tool", "arguments": {"foo": "job", "size": 2}}]"""
+    )
+    assert formatter.extract(result) == [
+        ("test_tool", """{"foo": "bar", "size": 10}"""),
+        ("another_tool", """{"foo": "job", "size": 2}"""),
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_qwen_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen")
+    tool_calls = json.dumps(FUNCTION)
+    assert formatter.apply(content=tool_calls) == [
+        """<tool_call>\n{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}\n</tool_call><|im_end|>\n"""
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_qwen_multi_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen")
+    tool_calls = json.dumps([FUNCTION] * 2)
+    assert formatter.apply(content=tool_calls) == [
+        """<tool_call>\n{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}\n</tool_call>\n"""
+        """<tool_call>\n{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}\n</tool_call>"""
+        "<|im_end|>\n"
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_qwen_tool_formatter():
+    formatter = ToolFormatter(tool_format="qwen")
+    wrapped_tool = {"type": "function", "function": TOOLS[0]}
+    assert formatter.apply(content=json.dumps(TOOLS)) == [
+        "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\n"
+        "You are provided with function signatures within <tools></tools> XML tags:\n<tools>"
+        f"\n{json.dumps(wrapped_tool, ensure_ascii=False)}"
+        "\n</tools>\n\nFor each function call, return a json object with function name and arguments within "
+        """<tool_call></tool_call> XML tags:\n<tool_call>\n{"name": <function-name>, """
+        """"arguments": <args-json-object>}\n</tool_call>"""
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_qwen_tool_extractor():
+    formatter = ToolFormatter(tool_format="qwen")
+    result = """<tool_call>\n{"name": "test_tool", "arguments": {"foo": "bar", "size": 10}}\n</tool_call>"""
+    assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_qwen_multi_tool_extractor():
+    formatter = ToolFormatter(tool_format="qwen")
+    result = (
+        """<tool_call>\n{"name": "test_tool", "arguments": {"foo": "bar", "size": 10}}\n</tool_call>\n"""
+        """<tool_call>\n{"name": "another_tool", "arguments": {"foo": "job", "size": 2}}\n</tool_call>"""
+    )
+    assert formatter.extract(result) == [
+        ("test_tool", """{"foo": "bar", "size": 10}"""),
+        ("another_tool", """{"foo": "job", "size": 2}"""),
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2")
+    tool_calls = json.dumps(FUNCTION)
+    assert formatter.apply(content=tool_calls) == [
+        """<|tool_call_start|>[tool_name(foo="bar", size=10)]<|tool_call_end|><|im_end|>\n"""
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_multi_function_formatter():
+    formatter = FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2")
+    tool_calls = json.dumps([FUNCTION] * 2)
+    assert formatter.apply(content=tool_calls) == [
+        """<|tool_call_start|>[tool_name(foo="bar", size=10), tool_name(foo="bar", size=10)]<|tool_call_end|>"""
+        "<|im_end|>\n"
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_tool_formatter():
+    formatter = ToolFormatter(tool_format="lfm2")
+    assert formatter.apply(content=json.dumps(TOOLS)) == [
+        "List of tools: <|tool_list_start|>" + json.dumps(TOOLS, ensure_ascii=False) + "<|tool_list_end|>"
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_tool_extractor():
+    formatter = ToolFormatter(tool_format="lfm2")
+    result = """<|tool_call_start|>[test_tool(foo="bar", size=10)]<|tool_call_end|>"""
+    assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_multi_tool_extractor():
+    formatter = ToolFormatter(tool_format="lfm2")
+    result = """<|tool_call_start|>[test_tool(foo="bar", size=10), another_tool(foo="job", size=2)]<|tool_call_end|>"""
+    assert formatter.extract(result) == [
+        ("test_tool", """{"foo": "bar", "size": 10}"""),
+        ("another_tool", """{"foo": "job", "size": 2}"""),
+    ]
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_tool_extractor_with_nested_dict():
+    formatter = ToolFormatter(tool_format="lfm2")
+    result = """<|tool_call_start|>[search(query="test", options={"limit": 10, "offset": 0})]<|tool_call_end|>"""
+    extracted = formatter.extract(result)
+    assert len(extracted) == 1
+    assert extracted[0][0] == "search"
+    args = json.loads(extracted[0][1])
+    assert args["query"] == "test"
+    assert args["options"] == {"limit": 10, "offset": 0}
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_tool_extractor_with_list_arg():
+    formatter = ToolFormatter(tool_format="lfm2")
+    result = """<|tool_call_start|>[batch_process(items=[1, 2, 3], enabled=True)]<|tool_call_end|>"""
+    extracted = formatter.extract(result)
+    assert len(extracted) == 1
+    assert extracted[0][0] == "batch_process"
+    args = json.loads(extracted[0][1])
+    assert args["items"] == [1, 2, 3]
+    assert args["enabled"] is True
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_tool_extractor_no_match():
+    formatter = ToolFormatter(tool_format="lfm2")
+    result = "This is a regular response without tool calls."
+    extracted = formatter.extract(result)
+    assert extracted == result
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_tool_round_trip():
+    formatter = FunctionFormatter(slots=["{{content}}"], tool_format="lfm2")
+    tool_formatter = ToolFormatter(tool_format="lfm2")
+    original = {"name": "my_func", "arguments": {"arg1": "hello", "arg2": 42, "arg3": True}}
+    formatted = formatter.apply(content=json.dumps(original))
+    extracted = tool_formatter.extract(formatted[0])
+    assert len(extracted) == 1
+    assert extracted[0][0] == original["name"]
+    assert json.loads(extracted[0][1]) == original["arguments"]
diff --git a/LlamaFactory/tests/data/test_loader.py b/LlamaFactory/tests/data/test_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..907bda347d18807862f59c5c38d2f162cb035d12
--- /dev/null
+++ b/LlamaFactory/tests/data/test_loader.py
@@ -0,0 +1,61 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from llamafactory.train.test_utils import load_dataset_module
+
+
+# Dataset and model identifiers; each falls back to the given default when the environment variable of the
+# same name is unset.
+# NOTE(review): DEMO_DATA is not referenced by any test in this file.
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")
+
+TINY_DATA = os.getenv("TINY_DATA", "llamafactory/tiny-supervised-dataset")
+
+# Keyword arguments shared by every test below; each test unpacks them into load_dataset_module and adds
+# its own overrides (val_size or eval_dataset) where needed.
+TRAIN_ARGS = {
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "full",
+    "template": "llama3",
+    "dataset": TINY_DATA,
+    "dataset_dir": "ONLINE",
+    "cutoff_len": 8192,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_load_train_only():
+    dataset_module = load_dataset_module(**TRAIN_ARGS)
+    assert dataset_module.get("train_dataset") is not None
+    assert dataset_module.get("eval_dataset") is None
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_load_val_size():
+    dataset_module = load_dataset_module(val_size=0.1, **TRAIN_ARGS)
+    assert dataset_module.get("train_dataset") is not None
+    assert dataset_module.get("eval_dataset") is not None
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_load_eval_data():
+    dataset_module = load_dataset_module(eval_dataset=TINY_DATA, **TRAIN_ARGS)
+    assert dataset_module.get("train_dataset") is not None
+    assert dataset_module.get("eval_dataset") is not None
diff --git a/LlamaFactory/tests/data/test_mm_plugin.py b/LlamaFactory/tests/data/test_mm_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..3187004aa5dcda9f2327370d4af47381a4ff5af2
--- /dev/null
+++ b/LlamaFactory/tests/data/test_mm_plugin.py
@@ -0,0 +1,433 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image
+
+from llamafactory.data.mm_plugin import get_mm_plugin
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.hparams import get_infer_args
+from llamafactory.model import load_tokenizer
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer, ProcessorMixin
+    from transformers.image_processing_utils import BaseImageProcessor
+
+    from llamafactory.data.mm_plugin import BasePlugin
+    from llamafactory.model.loader import TokenizerModule
+
+
+# Hugging Face access token from the environment; tests marked "Gated model." are skipped when it is unset.
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# Model identifiers; each falls back to the given default when the environment variable of the same name
+# is unset.
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA4 = os.getenv("TINY_LLAMA4", "llamafactory/tiny-random-Llama-4")
+
+# Conversation whose user turn carries a single "<image>" placeholder.
+MM_MESSAGES = [
+    {"role": "user", "content": "<image>What is in this image?"},
+    {"role": "assistant", "content": "A cat."},
+]
+
+# Conversation with one "<image>" turn followed by one "<audio>" turn.
+OMNI_MESSAGES = [
+    {"role": "user", "content": "<image>What is in this image?"},
+    {"role": "assistant", "content": "A cat."},
+    {"role": "user", "content": "<audio>What is in this audio?"},
+    {"role": "assistant", "content": "Nothing."},
+]
+
+# Conversation without any multimodal placeholder.
+TEXT_MESSAGES = [
+    {"role": "user", "content": "How are you"},
+    {"role": "assistant", "content": "I am fine!"},
+]
+
+# Conversation whose user turn carries a single "<video>" placeholder.
+# NOTE(review): "viode" looks like a typo for "video"; left unchanged here because tests outside this view
+# may compare against this exact text — confirm before fixing.
+VIDEO_MESSAGES = [
+    {"role": "user", "content": "<video>What is in this viode?"},
+    {"role": "assistant", "content": "A cat."},
+]
+
+# One audio clip: a zero-filled array of 1600 values.
+AUDIOS = [np.zeros(1600)]
+
+# One 32x32 white RGB image.
+IMAGES = [Image.new("RGB", (32, 32), (255, 255, 255))]
+
+# One video made of the same 32x32 white RGB frame repeated four times.
+VIDEOS = [[Image.new("RGB", (32, 32), (255, 255, 255))] * 4]
+
+# Empty media lists for the text-only checks.
+NO_IMAGES = []
+
+NO_VIDEOS = []
+
+NO_AUDIOS = []
+
+# Per-modality length lists passed positionally to plugin.get_mm_inputs in _check_plugin.
+# NOTE(review): presumably the number of media items per sample in the batch — confirm against the plugin API.
+IMGLENS = [1]
+
+AUDLENS = [1]
+
+NO_IMGLENS = [0]
+
+NO_VIDLENS = [0]
+
+NO_AUDLENS = [0]
+
+# Dummy token ids and labels passed to plugin.process_token_ids.
+INPUT_IDS = [0, 1, 2, 3, 4]
+
+LABELS = [0, 1, 2, 3, 4]
+
+# A batch holding a single sequence of 1024 ones, passed to plugin.get_mm_inputs.
+BATCH_IDS = [[1] * 1024]
+
+
+def _get_mm_inputs(processor: "ProcessorMixin") -> dict[str, "torch.Tensor"]:
+    image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+    return image_processor(images=IMAGES, return_tensors="pt")
+
+
+def _get_omni_inputs(processor: "ProcessorMixin") -> dict[str, "torch.Tensor"]:
+    mm_inputs = {}
+    image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+    feature_extractor = getattr(processor, "feature_extractor", None)
+
+    mm_inputs.update(image_processor(IMAGES, return_tensors="pt"))
+    mm_inputs.update(
+        feature_extractor(
+            AUDIOS,
+            sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+            return_attention_mask=True,
+            padding="max_length",
+            return_tensors="pt",
+        )
+    )
+    mm_inputs["feature_attention_mask"] = mm_inputs.pop("attention_mask")
+    return mm_inputs
+
+
+def _is_close(batch_a: dict[str, Any], batch_b: dict[str, Any]) -> None:
+    assert batch_a.keys() == batch_b.keys()
+    for key in batch_a.keys():
+        if isinstance(batch_a[key], torch.Tensor):
+            assert torch.allclose(batch_a[key], batch_b[key], rtol=1e-4, atol=1e-5)
+        elif isinstance(batch_a[key], list) and all(isinstance(item, torch.Tensor) for item in batch_a[key]):
+            assert len(batch_a[key]) == len(batch_b[key])
+            for tensor_a, tensor_b in zip(batch_a[key], batch_b[key]):
+                assert torch.allclose(tensor_a, tensor_b, rtol=1e-4, atol=1e-5)
+        else:
+            assert batch_a[key] == batch_b[key]
+
+
+def _load_tokenizer_module(model_name_or_path: str) -> "TokenizerModule":
+    model_args, *_ = get_infer_args({"model_name_or_path": model_name_or_path, "template": "default"})
+    return load_tokenizer(model_args)
+
+
+def _check_plugin(
+    plugin: "BasePlugin",
+    tokenizer: "PreTrainedTokenizer",
+    processor: "ProcessorMixin",
+    expected_mm_messages: list[dict[str, str]] = MM_MESSAGES,
+    expected_input_ids: list[int] = INPUT_IDS,
+    expected_labels: list[int] = LABELS,
+    expected_mm_inputs: dict[str, Any] = {},
+    expected_no_mm_inputs: dict[str, Any] = {},
+) -> None:
+    if plugin.__class__.__name__ == "Qwen2OmniPlugin":  # test omni_messages
+        assert plugin.process_messages(OMNI_MESSAGES, IMAGES, NO_VIDEOS, AUDIOS, processor) == expected_mm_messages
+        assert plugin.process_token_ids(INPUT_IDS, LABELS, IMAGES, NO_VIDEOS, AUDIOS, tokenizer, processor) == (
+            expected_input_ids,
+            expected_labels,
+        )
+        _is_close(
+            plugin.get_mm_inputs(IMAGES, NO_VIDEOS, AUDIOS, IMGLENS, NO_VIDLENS, AUDLENS, BATCH_IDS, processor),
+            expected_mm_inputs,
+        )
+    elif plugin.__class__.__name__ == "Qwen3VLPlugin":  # only check replacement
+        assert plugin.process_messages(VIDEO_MESSAGES, NO_IMAGES, VIDEOS, NO_AUDIOS, processor) == expected_mm_messages
+    elif plugin.__class__.__name__ != "BasePlugin":  # test mm_messages
+        assert plugin.process_messages(MM_MESSAGES, IMAGES, NO_VIDEOS, NO_AUDIOS, processor) == expected_mm_messages
+        assert plugin.process_token_ids(INPUT_IDS, LABELS, IMAGES, NO_VIDEOS, NO_AUDIOS, tokenizer, processor) == (
+            expected_input_ids,
+            expected_labels,
+        )
+        _is_close(
+            plugin.get_mm_inputs(IMAGES, NO_VIDEOS, NO_AUDIOS, IMGLENS, NO_VIDLENS, NO_AUDLENS, BATCH_IDS, processor),
+            expected_mm_inputs,
+        )
+
+    # test text_messages
+    assert plugin.process_messages(TEXT_MESSAGES, NO_IMAGES, NO_VIDEOS, NO_AUDIOS, processor) == TEXT_MESSAGES
+    assert plugin.process_token_ids(INPUT_IDS, LABELS, NO_IMAGES, NO_VIDEOS, NO_AUDIOS, tokenizer, processor) == (
+        INPUT_IDS,
+        LABELS,
+    )
+    _is_close(
+        plugin.get_mm_inputs(
+            NO_IMAGES, NO_VIDEOS, NO_AUDIOS, NO_IMGLENS, NO_VIDLENS, NO_AUDLENS, BATCH_IDS, processor
+        ),
+        expected_no_mm_inputs,
+    )
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_base_plugin():
+    tokenizer_module = _load_tokenizer_module(model_name_or_path=TINY_LLAMA3)
+    base_plugin = get_mm_plugin(name="base")
+    check_inputs = {"plugin": base_plugin, **tokenizer_module}
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
+@pytest.mark.skipif(not is_transformers_version_greater_than("4.50.0"), reason="Requires transformers>=4.50.0")
+def test_gemma3_plugin():
+    image_seqlen = 256
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="google/gemma-3-4b-it")
+    gemma3_plugin = get_mm_plugin(name="gemma3", image_token="<image_soft_token>")
+    image_tokens_expanded = "<image_soft_token>" * image_seqlen
+    check_inputs = {"plugin": gemma3_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {
+            key: value.replace("<image>", f"\n\n<start_of_image>{image_tokens_expanded}<end_of_image>\n\n")
+            for key, value in message.items()
+        }
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(tokenizer_module["processor"])
+    check_inputs["expected_mm_inputs"].pop("num_crops")
+    check_inputs["expected_mm_inputs"]["token_type_ids"] = [[0] * 1024]
+    check_inputs["expected_no_mm_inputs"] = {"token_type_ids": [[0] * 1024]}
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not is_transformers_version_greater_than("4.52.0"), reason="Requires transformers>=4.52.0")
+def test_internvl_plugin():
+    image_seqlen = 256
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="OpenGVLab/InternVL3-1B-hf")
+    internvl_plugin = get_mm_plugin("intern_vl", image_token="<image>", video_token="<video>")
+    check_inputs = {"plugin": internvl_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {
+            key: value.replace("<image>", f"<img>{'<IMG_CONTEXT>' * image_seqlen * 1}</img>")
+            for key, value in message.items()
+        }
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(tokenizer_module["processor"])
+    check_inputs["expected_mm_inputs"].pop("num_patches", None)
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not is_transformers_version_greater_than("4.51.0"), reason="Requires transformers>=4.51.0")
+def test_llama4_plugin():
+    tokenizer_module = _load_tokenizer_module(model_name_or_path=TINY_LLAMA4)
+    processor = tokenizer_module["processor"]
+    llama4_plugin = get_mm_plugin(name="llama4", image_token="<|image|>")
+    check_inputs = {"plugin": llama4_plugin, **tokenizer_module}
+    mm_inputs = _get_mm_inputs(tokenizer_module["processor"])
+    image_height, image_width = mm_inputs["pixel_values"][0].shape[-2:]
+    num_patches_per_chunk = int(
+        (image_height // processor.patch_size) * (image_width // processor.patch_size) // processor.downsample_ratio
+    )
+    aspect_ratios = mm_inputs.pop("aspect_ratios")
+    tokens_for_this_image = processor._prompt_split_image(aspect_ratios[0], num_patches_per_chunk)
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<image>", tokens_for_this_image) for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = mm_inputs
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llava_plugin():
+    image_seqlen = 576
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="llava-hf/llava-1.5-7b-hf")
+    llava_plugin = get_mm_plugin(name="llava", image_token="<image>")
+    check_inputs = {"plugin": llava_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<image>", "<image>" * image_seqlen) for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(tokenizer_module["processor"])
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llava_next_plugin():
+    image_seqlen = 1176
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="llava-hf/llava-v1.6-vicuna-7b-hf")
+    llava_next_plugin = get_mm_plugin(name="llava_next", image_token="<image>")
+    check_inputs = {"plugin": llava_next_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<image>", "<image>" * image_seqlen) for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(tokenizer_module["processor"])
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llava_next_video_plugin():
+    image_seqlen = 1176
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="llava-hf/LLaVA-NeXT-Video-7B-hf")
+    llava_next_video_plugin = get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>")
+    check_inputs = {"plugin": llava_next_video_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<image>", "<image>" * image_seqlen) for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(tokenizer_module["processor"])
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
+def test_paligemma_plugin():
+    image_seqlen = 256
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="google/paligemma-3b-pt-224")
+    paligemma_plugin = get_mm_plugin(name="paligemma", image_token="<image>")
+    check_inputs = {"plugin": paligemma_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<image>", "") for key, value in message.items()} for message in MM_MESSAGES
+    ]
+    check_inputs["expected_input_ids"] = [
+        tokenizer_module["tokenizer"].convert_tokens_to_ids(paligemma_plugin.image_token)
+    ] * image_seqlen + INPUT_IDS
+    check_inputs["expected_labels"] = [-100] * image_seqlen + LABELS
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(tokenizer_module["processor"])
+    check_inputs["expected_mm_inputs"]["token_type_ids"] = [[0] * image_seqlen + [1] * (1024 - image_seqlen)]
+    check_inputs["expected_no_mm_inputs"] = {"token_type_ids": [[1] * 1024]}
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not is_transformers_version_greater_than("4.50.0"), reason="Requires transformers>=4.50.0")
+def test_pixtral_plugin():
+    """Check the Pixtral plugin: `<image>` becomes rows of `[IMG]` split by `[IMG_BREAK]`, then `[IMG_END]`."""
+    image_slice_height, image_slice_width = 2, 2
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="mistral-community/pixtral-12b")
+    pixtral_plugin = get_mm_plugin(name="pixtral", image_token="[IMG]")
+    # Each image row is a run of `[IMG]`; `[IMG_BREAK]` separates consecutive rows (none after the last one)
+    # and a single `[IMG_END]` closes the whole image.
+    image_row = "[IMG]" * image_slice_width
+    image_placeholder = "[IMG_BREAK]".join([image_row] * image_slice_height) + "[IMG_END]"
+    check_inputs = {"plugin": pixtral_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<image>", image_placeholder) for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    expected_mm_inputs = _get_mm_inputs(tokenizer_module["processor"])
+    # Keep only the first `pixel_values` entry.
+    expected_mm_inputs["pixel_values"] = expected_mm_inputs["pixel_values"][0]
+    check_inputs["expected_mm_inputs"] = expected_mm_inputs
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not is_transformers_version_greater_than("4.52.0"), reason="Requires transformers>=4.52.0")
+def test_qwen2_omni_plugin():
+    """Check that the Qwen2.5-Omni plugin expands `<image>` and `<audio>` placeholders in `OMNI_MESSAGES`."""
+    image_seqlen, audio_seqlen = 4, 2
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2.5-Omni-7B")
+    qwen2_omni_plugin = get_mm_plugin(
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
+    )
+    # Each placeholder expands to its bos marker, `seqlen` copies of the modality token, then its eos marker.
+    image_placeholder = "<|vision_bos|>" + "<|IMAGE|>" * image_seqlen + "<|vision_eos|>"
+    audio_placeholder = "<|audio_bos|>" + "<|AUDIO|>" * audio_seqlen + "<|audio_eos|>"
+    check_inputs = {"plugin": qwen2_omni_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {
+            key: value.replace("<image>", image_placeholder).replace("<audio>", audio_placeholder)
+            for key, value in message.items()
+        }
+        for message in OMNI_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_omni_inputs(tokenizer_module["processor"])
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_qwen2_vl_plugin():
+    """Check that the Qwen2-VL plugin wraps the expanded image pads in vision start/end markers."""
+    image_seqlen = 4
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2-VL-7B-Instruct")
+    qwen2_vl_plugin = get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>")
+    image_placeholder = "<|vision_start|>" + "<|image_pad|>" * image_seqlen + "<|vision_end|>"
+    expected_mm_messages = [
+        {key: value.replace("<image>", image_placeholder) for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    expected_mm_inputs = _get_mm_inputs(tokenizer_module["processor"])
+    check_inputs = {"plugin": qwen2_vl_plugin, **tokenizer_module}
+    check_inputs.update(expected_mm_messages=expected_mm_messages, expected_mm_inputs=expected_mm_inputs)
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not is_transformers_version_greater_than("4.57.0"), reason="Requires transformers>=4.57.0")
+def test_qwen3_vl_plugin():
+    """Check the Qwen3-VL plugin's video expansion.
+
+    Each `<video>` must become two timestamped segments, each a vision-wrapped run of video pad tokens.
+    """
+    frame_seqlen = 1
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen3-VL-30B-A3B-Instruct")
+    qwen3_vl_plugin = get_mm_plugin(name="qwen3_vl", video_token="<|video_pad|>")
+    # The expected text is slightly different from the original processor's output because this repo
+    # defaults to `fps=2`.
+    frame_placeholder = "<|vision_start|>" + "<|video_pad|>" * frame_seqlen + "<|vision_end|>"
+    video_placeholder = f"<0.2 seconds>{frame_placeholder}<1.2 seconds>{frame_placeholder}"
+    check_inputs = {"plugin": qwen3_vl_plugin, **tokenizer_module}
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<video>", video_placeholder) for key, value in message.items()}
+        for message in VIDEO_MESSAGES
+    ]
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not is_transformers_version_greater_than("4.47.0"), reason="Requires transformers>=4.47.0")
+def test_video_llava_plugin():
+    """Check that the Video-LLaVA plugin repeats each `<image>` placeholder `image_seqlen` times."""
+    image_seqlen = 256
+    tokenizer_module = _load_tokenizer_module(model_name_or_path="LanguageBind/Video-LLaVA-7B-hf")
+    video_llava_plugin = get_mm_plugin(name="video_llava", image_token="<image>", video_token="<video>")
+    # `image_seqlen` copies of the image token stand in for every `<image>` placeholder.
+    expanded_image = "<image>" * image_seqlen
+    expected_mm_messages = [{k: v.replace("<image>", expanded_image) for k, v in m.items()} for m in MM_MESSAGES]
+    check_inputs = {"plugin": video_llava_plugin, **tokenizer_module, "expected_mm_messages": expected_mm_messages}
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(tokenizer_module["processor"])
+    _check_plugin(**check_inputs)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_lfm2_vl_plugin():
+    """Check that the `lfm2_vl` plugin is created with only the image token configured."""
+    plugin = get_mm_plugin(name="lfm2_vl", image_token="<image>")
+    assert plugin is not None
+    assert plugin.image_token == "<image>"
+    # Tokens that were not passed to `get_mm_plugin` must be left unset.
+    assert plugin.video_token is None
+    assert plugin.audio_token is None
+    assert type(plugin).__name__ == "LFMVLPlugin"
diff --git a/LlamaFactory/tests/data/test_template.py b/LlamaFactory/tests/data/test_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9d9ab2d8bf82635c7b476ba7d35ed7e77cbf7f1
--- /dev/null
+++ b/LlamaFactory/tests/data/test_template.py
@@ -0,0 +1,353 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import TYPE_CHECKING
+
+import pytest
+from transformers import AutoTokenizer
+
+from llamafactory.data import get_template_and_fix_tokenizer
+from llamafactory.data.template import parse_template
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.hparams import DataArguments
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
+
+HF_TOKEN = os.getenv("HF_TOKEN")  # unset -> tests marked with `not HF_TOKEN` are skipped or expected to fail
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # model id, overridable via env
+TINY_LLAMA4 = os.getenv("TINY_LLAMA4", "llamafactory/tiny-random-Llama-4")  # model id, overridable via env
+
+MESSAGES = [  # plain two-round conversation; the default input of `_check_template`
+    {"role": "user", "content": "How are you"},
+    {"role": "assistant", "content": "I am fine!"},
+    {"role": "user", "content": "你好"},
+    {"role": "assistant", "content": "很高兴认识你！"},
+]
+
+MESSAGES_WITH_THOUGHT = [  # same conversation, with a `<think>` block in front of each assistant reply
+    {"role": "user", "content": "How are you"},
+    {"role": "assistant", "content": "<think>\nModel thought here\n</think>\n\nI am fine!"},
+    {"role": "user", "content": "你好"},
+    {"role": "assistant", "content": "<think>\n模型思考内容\n</think>\n\n很高兴认识你！"},
+]
+
+
+def _check_tokenization(
+    tokenizer: "PreTrainedTokenizer", batch_input_ids: list[list[int]], batch_text: list[str]
+) -> None:
+    r"""Check that encode(text) == token_ids and decode(token_ids) == text for every aligned pair.
+
+    Both batches must have the same length: `zip` would otherwise silently skip the unmatched tail.
+    """
+    assert len(batch_input_ids) == len(batch_text), "batch_input_ids and batch_text differ in length"
+    for input_ids, text in zip(batch_input_ids, batch_text):
+        assert tokenizer.encode(text, add_special_tokens=False) == input_ids
+        assert tokenizer.decode(input_ids) == text
+
+
+def _check_template(
+    model_id: str,
+    template_name: str,
+    prompt_str: str,
+    answer_str: str,
+    messages: list[dict[str, str]] = MESSAGES,
+) -> None:
+    r"""Check that a template encodes the messages exactly like the tokenizer's own chat template.
+
+    Args:
+        model_id: the model id on hugging face hub, used to load the tokenizer.
+        template_name: the name of the template under test.
+        prompt_str: the expected text of the prompt part.
+        answer_str: the expected text of the answer part.
+        messages: the conversation to render; defaults to `MESSAGES`.
+
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    content_str = tokenizer.apply_chat_template(messages, tokenize=False)  # reference text
+    content_ids = tokenizer.apply_chat_template(messages, tokenize=True)  # reference token ids
+    if is_transformers_version_greater_than("5.0.0"):
+        content_ids = content_ids["input_ids"]  # there the ids are read from the "input_ids" entry
+
+    template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template=template_name))
+    prompt_ids, answer_ids = template.encode_oneturn(tokenizer, messages)
+    assert content_str == prompt_str + answer_str
+    assert content_ids == prompt_ids + answer_ids
+    _check_tokenization(tokenizer, (prompt_ids, answer_ids), (prompt_str, answer_str))
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_encode_oneturn():
+    """`encode_oneturn` must return the whole history as the prompt and the last reply as the answer."""
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
+    expected_prompt = (
+        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\nI am fine!<|eot_id|>"
+        "<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+    )
+    prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES)
+    _check_tokenization(tokenizer, (prompt_ids, answer_ids), (expected_prompt, "很高兴认识你！<|eot_id|>"))
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_encode_multiturn():
+    """Check the first two (prompt, answer) pairs produced by `encode_multiturn`."""
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
+    encoded_pairs = template.encode_multiturn(tokenizer, MESSAGES)
+    # Flatten the two (prompt, answer) pairs so they line up with the expected texts below.
+    first_prompt, first_answer = encoded_pairs[0][0], encoded_pairs[0][1]
+    second_prompt, second_answer = encoded_pairs[1][0], encoded_pairs[1][1]
+    expected_texts = (
+        # Only the first prompt carries the `<|begin_of_text|>` prefix.
+        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n",
+        "I am fine!<|eot_id|>",
+        # The second prompt starts directly at the user header.
+        "<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+        "很高兴认识你！<|eot_id|>",
+    )
+    _check_tokenization(tokenizer, (first_prompt, first_answer, second_prompt, second_answer), expected_texts)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("cot_messages", [True, False])
+@pytest.mark.parametrize("enable_thinking", [True, False, None])
+def test_reasoning_encode_oneturn(cot_messages: bool, enable_thinking: bool):
+    """Check where `encode_oneturn` puts the thought for every `cot_messages` / `enable_thinking` combination."""
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
+    data_args = DataArguments(template="qwen3", enable_thinking=enable_thinking)
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES_WITH_THOUGHT if cot_messages else MESSAGES)
+    empty_thought = "<think>\n\n</think>\n\n"
+    prompt_str = (
+        f"<|im_start|>user\n{MESSAGES[0]['content']}<|im_end|>\n<|im_start|>assistant\n"
+        f"{MESSAGES[1]['content']}<|im_end|>\n"
+        f"<|im_start|>user\n{MESSAGES[2]['content']}<|im_end|>\n<|im_start|>assistant\n"
+    )
+    if cot_messages and enable_thinking is not False:
+        # The thought written in the message is kept as part of the answer.
+        answer_str = f"{MESSAGES_WITH_THOUGHT[3]['content']}<|im_end|>\n"
+    elif enable_thinking:
+        answer_str = f"{empty_thought}{MESSAGES[3]['content']}<|im_end|>\n"
+    else:
+        prompt_str += empty_thought
+        answer_str = f"{MESSAGES[3]['content']}<|im_end|>\n"
+    _check_tokenization(tokenizer, (prompt_ids, answer_ids), (prompt_str, answer_str))
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("cot_messages", [True, False])
+@pytest.mark.parametrize("enable_thinking", [True, False, None])
+def test_reasoning_encode_multiturn(cot_messages: bool, enable_thinking: bool):
+    """Check where `encode_multiturn` puts the thought for every `cot_messages` / `enable_thinking` combination."""
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
+    data_args = DataArguments(template="qwen3", enable_thinking=enable_thinking)
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    encoded_pairs = template.encode_multiturn(tokenizer, MESSAGES_WITH_THOUGHT if cot_messages else MESSAGES)
+    empty_thought = "<think>\n\n</think>\n\n"
+    # The written thought is expected only when the messages contain one and thinking is not disabled.
+    strip_thought = not cot_messages or enable_thinking is False
+    messages = MESSAGES if strip_thought else MESSAGES_WITH_THOUGHT
+    prompts = [f"<|im_start|>user\n{MESSAGES[i]['content']}<|im_end|>\n<|im_start|>assistant\n" for i in (0, 2)]
+    answers = [f"{messages[i]['content']}<|im_end|>\n" for i in (1, 3)]
+    if strip_thought and enable_thinking:
+        # Thinking enabled: the empty thought belongs to the answers.
+        answers = [empty_thought + answer for answer in answers]
+    elif strip_thought:
+        # Thinking disabled or unset: the empty thought is appended to the prompts.
+        prompts = [prompt + empty_thought for prompt in prompts]
+
+    _check_tokenization(
+        tokenizer,
+        (encoded_pairs[0][0], encoded_pairs[0][1], encoded_pairs[1][0], encoded_pairs[1][1]),
+        (prompts[0], answers[0], prompts[1], answers[1]),
+    )
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_jinja_template():
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
+    tokenizer.chat_template = template._get_jinja_template(tokenizer)  # NOTE(review): llama3 seems not auto-replaced
+    assert tokenizer.chat_template != ref_tokenizer.chat_template  # the generated text differs from the stock one
+    assert tokenizer.apply_chat_template(MESSAGES) == ref_tokenizer.apply_chat_template(MESSAGES)  # same rendering
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_ollama_modelfile():
+    """Check the exact text of the Ollama modelfile generated for the llama3 template."""
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    llama3_template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
+    expected_modelfile = (
+        "# ollama modelfile auto-generated by llamafactory\n\nFROM .\n\n"
+        'TEMPLATE """<|begin_of_text|>'
+        "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}"
+        '{{ range .Messages }}{{ if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Content }}'
+        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+        '{{ else if eq .Role "assistant" }}{{ .Content }}<|eot_id|>{{ end }}{{ end }}"""\n\n'
+        'PARAMETER stop "<|eom_id|>"\nPARAMETER stop "<|eot_id|>"\n'
+        "PARAMETER num_ctx 4096\n"
+    )
+    assert llama3_template.get_ollama_modelfile(tokenizer) == expected_modelfile
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_get_stop_token_ids():
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
+    assert set(template.get_stop_token_ids(tokenizer)) == {128008, 128009}  # order-insensitive comparison
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
+def test_gemma_template():
+    """Check the `gemma` template against the chat template of `google/gemma-3-4b-it`."""
+    expected_prompt = (
+        f"<bos><start_of_turn>user\n{MESSAGES[0]['content']}<end_of_turn>\n"
+        f"<start_of_turn>model\n{MESSAGES[1]['content']}<end_of_turn>\n"
+        f"<start_of_turn>user\n{MESSAGES[2]['content']}<end_of_turn>\n"
+        "<start_of_turn>model\n"
+    )
+    _check_template("google/gemma-3-4b-it", "gemma", expected_prompt, f"{MESSAGES[3]['content']}<end_of_turn>\n")
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
+def test_gemma2_template():
+    """Check the `gemma2` template against the chat template of `google/gemma-2-2b-it`."""
+    expected_prompt = (
+        f"<bos><start_of_turn>user\n{MESSAGES[0]['content']}<end_of_turn>\n"
+        f"<start_of_turn>model\n{MESSAGES[1]['content']}<end_of_turn>\n"
+        f"<start_of_turn>user\n{MESSAGES[2]['content']}<end_of_turn>\n"
+        "<start_of_turn>model\n"
+    )
+    _check_template("google/gemma-2-2b-it", "gemma2", expected_prompt, f"{MESSAGES[3]['content']}<end_of_turn>\n")
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
+def test_llama3_template():
+    """Check the `llama3` template against the chat template of `meta-llama/Meta-Llama-3-8B-Instruct`."""
+    expected_prompt = (
+        f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{MESSAGES[0]['content']}<|eot_id|>"
+        f"<|start_header_id|>assistant<|end_header_id|>\n\n{MESSAGES[1]['content']}<|eot_id|>"
+        f"<|start_header_id|>user<|end_header_id|>\n\n{MESSAGES[2]['content']}<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+    )
+    _check_template("meta-llama/Meta-Llama-3-8B-Instruct", "llama3", expected_prompt, f"{MESSAGES[3]['content']}<|eot_id|>")
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_llama4_template():
+    """Check the `llama4` template against the chat template of the tiny Llama-4 tokenizer."""
+    expected_prompt = (
+        f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{MESSAGES[0]['content']}<|eot|>"
+        f"<|header_start|>assistant<|header_end|>\n\n{MESSAGES[1]['content']}<|eot|>"
+        f"<|header_start|>user<|header_end|>\n\n{MESSAGES[2]['content']}<|eot|>"
+        "<|header_start|>assistant<|header_end|>\n\n"
+    )
+    _check_template(TINY_LLAMA4, "llama4", expected_prompt, f"{MESSAGES[3]['content']}<|eot|>")
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_phi4_template():
+    """Check the `phi4` template against the chat template of `microsoft/phi-4`."""
+    expected_prompt = (
+        f"<|im_start|>user<|im_sep|>{MESSAGES[0]['content']}<|im_end|>"
+        f"<|im_start|>assistant<|im_sep|>{MESSAGES[1]['content']}<|im_end|>"
+        f"<|im_start|>user<|im_sep|>{MESSAGES[2]['content']}<|im_end|>"
+        "<|im_start|>assistant<|im_sep|>"
+    )
+    _check_template("microsoft/phi-4", "phi4", expected_prompt, f"{MESSAGES[3]['content']}<|im_end|>")
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
+def test_qwen2_5_template():
+    """Check the `qwen` template (default system prompt included) against `Qwen/Qwen2.5-7B-Instruct`."""
+    expected_prompt = (
+        "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n"
+        f"<|im_start|>user\n{MESSAGES[0]['content']}<|im_end|>\n"
+        f"<|im_start|>assistant\n{MESSAGES[1]['content']}<|im_end|>\n"
+        f"<|im_start|>user\n{MESSAGES[2]['content']}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    _check_template("Qwen/Qwen2.5-7B-Instruct", "qwen", expected_prompt, f"{MESSAGES[3]['content']}<|im_end|>\n")
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize("cot_messages", [True, False])
+def test_qwen3_template(cot_messages: bool):
+    """Check the `qwen3` template against `Qwen/Qwen3-8B`, with and without a thought in the messages."""
+    prompt_str = (
+        f"<|im_start|>user\n{MESSAGES[0]['content']}<|im_end|>\n"
+        f"<|im_start|>assistant\n{MESSAGES[1]['content']}<|im_end|>\n"
+        f"<|im_start|>user\n{MESSAGES[2]['content']}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    # The prompt is the same in both cases; only the rendered conversation and the answer differ.
+    messages = MESSAGES_WITH_THOUGHT if cot_messages else MESSAGES
+    # Without a written thought, an empty thought block is expected in front of the answer.
+    empty_thought = "" if cot_messages else "<think>\n\n</think>\n\n"
+    answer_str = f"{empty_thought}{messages[3]['content']}<|im_end|>\n"
+
+    _check_template("Qwen/Qwen3-8B", "qwen3", prompt_str, answer_str, messages=messages)
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_parse_llama3_template():
+    """Check the template that `parse_template` derives from the tiny Llama-3 tokenizer."""
+    parsed = parse_template(AutoTokenizer.from_pretrained(TINY_LLAMA3))
+    assert parsed.format_user.slots == [
+        "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+    ]
+    assert parsed.format_assistant.slots == ["{{content}}<|eot_id|>"]
+    assert parsed.format_system.slots == ["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]
+    assert parsed.format_prefix.slots == ["<|begin_of_text|>"]
+    assert parsed.default_system == ""
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
+def test_parse_qwen_template():
+    """Check the plain `Template` that `parse_template` derives from the Qwen2.5 tokenizer."""
+    parsed = parse_template(AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct"))
+    assert type(parsed).__name__ == "Template"
+    assert parsed.format_user.slots == ["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    assert parsed.format_assistant.slots == ["{{content}}<|im_end|>\n"]
+    assert parsed.format_system.slots == ["<|im_start|>system\n{{content}}<|im_end|>\n"]
+    assert parsed.format_prefix.slots == []
+    assert parsed.default_system == "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
+def test_parse_qwen3_template():
+    """Check the `ReasoningTemplate` that `parse_template` derives from the Qwen3 tokenizer."""
+    parsed = parse_template(AutoTokenizer.from_pretrained("Qwen/Qwen3-8B"))
+    assert type(parsed).__name__ == "ReasoningTemplate"
+    assert parsed.format_user.slots == ["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    assert parsed.format_assistant.slots == ["{{content}}<|im_end|>\n"]
+    assert parsed.format_system.slots == ["<|im_start|>system\n{{content}}<|im_end|>\n"]
+    assert parsed.format_prefix.slots == []
+    assert parsed.default_system == ""
diff --git a/LlamaFactory/tests/e2e/test_chat.py b/LlamaFactory/tests/e2e/test_chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..e33f32c5646f14696304aa7de51dbb85fe1ce9ea
--- /dev/null
+++ b/LlamaFactory/tests/e2e/test_chat.py
@@ -0,0 +1,53 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from llamafactory.chat import ChatModel
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # model id, overridable via env
+
+INFER_ARGS = {  # passed unchanged to `ChatModel` by both tests
+    "model_name_or_path": TINY_LLAMA3,
+    "finetuning_type": "lora",
+    "template": "llama3",
+    "infer_dtype": "float16",
+    "do_sample": False,
+    "max_new_tokens": 1,
+}
+
+MESSAGES = [  # single-turn conversation sent to the model
+    {"role": "user", "content": "Hi"},
+]
+
+EXPECTED_RESPONSE = "_rho"  # exact text both tests expect back for `MESSAGES` with `INFER_ARGS`
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_chat():
+    """The first (non-streamed) response must be exactly `EXPECTED_RESPONSE`."""
+    assert ChatModel(INFER_ARGS).chat(MESSAGES)[0].response_text == EXPECTED_RESPONSE
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_stream_chat():
+    """The concatenated streamed tokens must be exactly `EXPECTED_RESPONSE`."""
+    chat_model = ChatModel(INFER_ARGS)
+    # Drain the stream; joining the pieces is equivalent to appending them one by one.
+    streamed_text = "".join(chat_model.stream_chat(MESSAGES))
+
+    assert streamed_text == EXPECTED_RESPONSE
diff --git a/LlamaFactory/tests/e2e/test_sglang.py b/LlamaFactory/tests/e2e/test_sglang.py
new file mode 100644
index 0000000000000000000000000000000000000000..7182ed382c331c5103211688e3b9747f1427052e
--- /dev/null
+++ b/LlamaFactory/tests/e2e/test_sglang.py
@@ -0,0 +1,73 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import pytest
+
+from llamafactory.chat import ChatModel
+from llamafactory.extras.packages import is_sglang_available
+
+
+MODEL_NAME = "Qwen/Qwen2.5-0.5B"  # checkpoint loaded through the SGLang backend
+
+
+INFER_ARGS = {  # passed unchanged to `ChatModel` by both tests
+    "model_name_or_path": MODEL_NAME,
+    "finetuning_type": "lora",
+    "template": "llama3",  # NOTE(review): the model above is a Qwen checkpoint — confirm this template is intended
+    "infer_dtype": "float16",
+    "infer_backend": "sglang",
+    "do_sample": False,
+    "max_new_tokens": 1,
+}
+
+
+MESSAGES = [  # single-turn conversation sent to the model
+    {"role": "user", "content": "Hi"},
+]
+
+
+@pytest.mark.runs_on(["cuda"])
+@pytest.mark.skipif(not is_sglang_available(), reason="SGLang is not installed")
+def test_chat():
+    r"""Test the SGLang engine's basic chat functionality: the first response must be non-empty."""
+    response = ChatModel(INFER_ARGS).chat(MESSAGES)[0]
+    print(response.response_text)
+    # TODO: Compare against an EXPECTED_RESPONSE; until then, at least require some text (as test_stream_chat does).
+    assert response.response_text, "Should receive a non-empty response"
+
+
+@pytest.mark.runs_on(["cuda"])
+@pytest.mark.skipif(not is_sglang_available(), reason="SGLang is not installed")
+def test_stream_chat():
+    r"""Check that streaming chat through the SGLang engine yields a non-empty response."""
+    chat_model = ChatModel(INFER_ARGS)
+
+    # Collect every streamed piece, then stitch them together into the full response.
+    pieces = list(chat_model.stream_chat(MESSAGES))
+    response = "".join(pieces)
+
+    print("Complete response:", response)
+    assert response, "Should receive a non-empty response"
+
+
+# Allow running this module as a script (without pytest), provided SGLang is installed.
+if __name__ == "__main__":
+    if not is_sglang_available():
+        print("SGLang is not available. Please install it.")
+        sys.exit(1)
+
+    for run_test in (test_chat, test_stream_chat):
+        run_test()
diff --git a/LlamaFactory/tests/e2e/test_train.py b/LlamaFactory/tests/e2e/test_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..dabb888b9f042986da39f3bb9abf5126114bf3d5
--- /dev/null
+++ b/LlamaFactory/tests/e2e/test_train.py
@@ -0,0 +1,73 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from llamafactory.train.tuner import export_model, run_exp
+
+
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")  # dataset location, overridable via env
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # model id, overridable via env
+
+TINY_LLAMA_ADAPTER = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")  # adapter id
+
+TRAIN_ARGS = {  # shared by every `test_run_exp` case; stage, dataset and output_dir are added per case
+    "model_name_or_path": TINY_LLAMA3,
+    "do_train": True,
+    "finetuning_type": "lora",
+    "dataset_dir": "REMOTE:" + DEMO_DATA,
+    "template": "llama3",
+    "cutoff_len": 1,
+    "overwrite_output_dir": True,
+    "per_device_train_batch_size": 1,
+    "max_steps": 1,
+    "report_to": "none",
+}
+
+INFER_ARGS = {  # used by `test_export`, which adds the export directory
+    "model_name_or_path": TINY_LLAMA3,
+    "adapter_name_or_path": TINY_LLAMA_ADAPTER,
+    "finetuning_type": "lora",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
+
+OS_NAME = os.getenv("OS_NAME", "")  # empty unless set; a value starting with "windows" marks the `rm` case xfail
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+@pytest.mark.parametrize(
+    "stage,dataset",
+    [
+        ("pt", "c4_demo"),
+        ("sft", "alpaca_en_demo"),
+        ("dpo", "dpo_en_demo"),
+        ("kto", "kto_en_demo"),
+        pytest.param("rm", "dpo_en_demo", marks=pytest.mark.xfail(OS_NAME.startswith("windows"), reason="OS error.")),
+    ],
+)
+def test_run_exp(stage: str, dataset: str):
+    output_dir = os.path.join("output", f"train_{stage}")  # one directory per stage under "output"
+    run_exp({"stage": stage, "dataset": dataset, "output_dir": output_dir, **TRAIN_ARGS})
+    assert os.path.exists(output_dir)  # only checks that the directory exists, not what was written into it
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_export():
+    export_dir = os.path.join("output", "llama3_export")
+    export_model({"export_dir": export_dir, **INFER_ARGS})  # export using the model and adapter from INFER_ARGS
+    assert os.path.exists(export_dir)  # only checks that the directory exists, not what was written into it
diff --git a/LlamaFactory/tests/eval/test_eval_template.py b/LlamaFactory/tests/eval/test_eval_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..783d0b9e3d5b479ae48873486254d5ebceb77e42
--- /dev/null
+++ b/LlamaFactory/tests/eval/test_eval_template.py
@@ -0,0 +1,95 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from llamafactory.eval.template import get_eval_template
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_eval_template_en():
+    """Check the English eval template: one few-shot example, then the target question."""
+    fewshot = {
+        "question": "Fewshot question",
+        "A": "Fewshot1",
+        "B": "Fewshot2",
+        "C": "Fewshot3",
+        "D": "Fewshot4",
+        "answer": "B",
+    }
+    target = {
+        "question": "Target question",
+        "A": "Target1",
+        "B": "Target2",
+        "C": "Target3",
+        "D": "Target4",
+        "answer": "C",
+    }
+    eval_template = get_eval_template(name="en")
+    messages = eval_template.format_example(target, support_set=[fewshot], subject_name="SubName")
+    expected_messages = [
+        {
+            "role": "user",
+            "content": (
+                "The following are multiple choice questions (with answers) about SubName.\n\n"
+                "Fewshot question\nA. Fewshot1\nB. Fewshot2\nC. Fewshot3\nD. Fewshot4\nAnswer:"
+            ),
+        },
+        {"role": "assistant", "content": "B"},
+        {
+            "role": "user",
+            "content": "Target question\nA. Target1\nB. Target2\nC. Target3\nD. Target4\nAnswer:",
+        },
+        {"role": "assistant", "content": "C"},
+    ]
+    assert messages == expected_messages
+
+
+@pytest.mark.runs_on(["cpu", "mps"])
+def test_eval_template_zh():
+    """Check the Chinese eval template: one few-shot example, then the target question."""
+    fewshot = {
+        "question": "示例问题",
+        "A": "示例答案1",
+        "B": "示例答案2",
+        "C": "示例答案3",
+        "D": "示例答案4",
+        "answer": "B",
+    }
+    target = {
+        "question": "目标问题",
+        "A": "目标答案1",
+        "B": "目标答案2",
+        "C": "目标答案3",
+        "D": "目标答案4",
+        "answer": "C",
+    }
+    eval_template = get_eval_template(name="zh")
+    messages = eval_template.format_example(target, support_set=[fewshot], subject_name="主题")
+    expected_messages = [
+        {
+            "role": "user",
+            "content": (
+                "以下是中国关于主题考试的单项选择题，请选出其中的正确答案。\n\n"
+                "示例问题\nA. 示例答案1\nB. 示例答案2\nC. 示例答案3\nD. 示例答案4\n答案："
+            ),
+        },
+        {"role": "assistant", "content": "B"},
+        {
+            "role": "user",
+            "content": "目标问题\nA. 目标答案1\nB. 目标答案2\nC. 目标答案3\nD. 目标答案4\n答案：",
+        },
+        {"role": "assistant", "content": "C"},
+    ]
+    assert messages == expected_messages
diff --git a/LlamaFactory/tests/model/model_utils/test_add_tokens.py b/LlamaFactory/tests/model/model_utils/test_add_tokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb1c414abd3b812083b91fd77d7214e0f6b72783
--- /dev/null
+++ b/LlamaFactory/tests/model/model_utils/test_add_tokens.py
@@ -0,0 +1,46 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from llamafactory.hparams import ModelArguments
+from llamafactory.model import load_tokenizer
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+UNUSED_TOKEN = "<|UNUSED_TOKEN|>"  # token string registered via add_tokens / add_special_tokens in the test
+
+
+@pytest.mark.parametrize("special_tokens", [False, True])
+def test_add_tokens(special_tokens: bool):
+    if special_tokens:
+        model_args = ModelArguments(model_name_or_path=TINY_LLAMA3, add_special_tokens=UNUSED_TOKEN)
+    else:
+        model_args = ModelArguments(model_name_or_path=TINY_LLAMA3, add_tokens=UNUSED_TOKEN)
+
+    tokenizer = load_tokenizer(model_args)["tokenizer"]
+    encoded_ids = tokenizer.encode(UNUSED_TOKEN, add_special_tokens=False)
+    assert len(encoded_ids) == 1
+    decoded_str = tokenizer.decode(encoded_ids, skip_special_tokens=True)
+    if special_tokens:
+        assert decoded_str == ""
+    else:
+        assert decoded_str == UNUSED_TOKEN
+
+
+if __name__ == "__main__":  # allow running this file directly instead of through the pytest CLI
+    pytest.main([__file__])
diff --git a/LlamaFactory/tests/model/model_utils/test_attention.py b/LlamaFactory/tests/model/model_utils/test_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..075caeaee25dd4ea52f568acacb26a764d5677e6
--- /dev/null
+++ b/LlamaFactory/tests/model/model_utils/test_attention.py
@@ -0,0 +1,60 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+from transformers.utils import is_flash_attn_2_available
+
+
+# Compatible with Transformers v4 and Transformers v5
+try:
+    from transformers.utils import is_torch_sdpa_available
+except ImportError:  # helper not importable: define a stand-in with the same name
+
+    def is_torch_sdpa_available():  # stub: always reports SDPA as available
+        return True
+
+
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.train.test_utils import load_infer_model
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+INFER_ARGS = {  # keyword arguments unpacked into load_infer_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "template": "llama3",
+}
+
+
+@pytest.mark.xfail(is_transformers_version_greater_than("4.48"), reason="Attention refactor.")
+def test_attention():
+    attention_available = ["disabled"]
+    if is_torch_sdpa_available():
+        attention_available.append("sdpa")
+
+    if is_flash_attn_2_available():
+        attention_available.append("fa2")
+
+    llama_attention_classes = {
+        "disabled": "LlamaAttention",
+        "sdpa": "LlamaSdpaAttention",
+        "fa2": "LlamaFlashAttention2",
+    }
+    for requested_attention in attention_available:
+        model = load_infer_model(flash_attn=requested_attention, **INFER_ARGS)
+        for module in model.modules():
+            if "Attention" in module.__class__.__name__:
+                assert module.__class__.__name__ == llama_attention_classes[requested_attention]
diff --git a/LlamaFactory/tests/model/model_utils/test_checkpointing.py b/LlamaFactory/tests/model/model_utils/test_checkpointing.py
new file mode 100644
index 0000000000000000000000000000000000000000..2402e6fb741a15cfd23dda9cae600e26c473869c
--- /dev/null
+++ b/LlamaFactory/tests/model/model_utils/test_checkpointing.py
@@ -0,0 +1,66 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+import torch
+
+from llamafactory.extras.misc import get_current_device
+from llamafactory.train.test_utils import load_train_model
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+TRAIN_ARGS = {  # keyword arguments unpacked into load_train_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "lora",
+    "lora_target": "all",
+    "dataset": "llamafactory/tiny-supervised-dataset",
+    "dataset_dir": "ONLINE",
+    "template": "llama3",
+    "cutoff_len": 1024,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+
+@pytest.mark.parametrize("disable_gradient_checkpointing", [False, True])
+def test_vanilla_checkpointing(disable_gradient_checkpointing: bool):
+    model = load_train_model(disable_gradient_checkpointing=disable_gradient_checkpointing, **TRAIN_ARGS)
+    for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()):
+        assert getattr(module, "gradient_checkpointing") != disable_gradient_checkpointing
+
+
+def test_unsloth_gradient_checkpointing():
+    model = load_train_model(use_unsloth_gc=True, **TRAIN_ARGS)
+    for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()):
+        assert module._gradient_checkpointing_func.__self__.__name__ == "UnslothGradientCheckpointing"
+
+
+def test_upcast_layernorm():
+    model = load_train_model(upcast_layernorm=True, **TRAIN_ARGS)
+    for name, param in model.named_parameters():
+        if param.ndim == 1 and "norm" in name:
+            assert param.dtype == torch.float32
+
+
+def test_upcast_lmhead_output():
+    model = load_train_model(upcast_lmhead_output=True, **TRAIN_ARGS)
+    inputs = torch.randn((1, 16), dtype=torch.float16, device=get_current_device())
+    outputs: torch.Tensor = model.get_output_embeddings()(inputs)
+    assert outputs.dtype == torch.float32
diff --git a/LlamaFactory/tests/model/model_utils/test_misc.py b/LlamaFactory/tests/model/model_utils/test_misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c8b3bf916fe745e2ed93b68ba87d5cafa9c15a
--- /dev/null
+++ b/LlamaFactory/tests/model/model_utils/test_misc.py
@@ -0,0 +1,43 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+
+from llamafactory.model.model_utils.misc import find_expanded_modules
+
+
+HF_TOKEN = os.getenv("HF_TOKEN")  # None when unset, which makes the skipif below skip the gated-model test
+
+
+@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
+def test_expanded_modules():
+    config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+    with torch.device("meta"):
+        model = AutoModelForCausalLM.from_config(config)
+
+    expanded_modules = find_expanded_modules(model, ["q_proj", "v_proj"], num_layer_trainable=4)
+    assert expanded_modules == [
+        "model.layers.7.self_attn.q_proj",
+        "model.layers.7.self_attn.v_proj",
+        "model.layers.15.self_attn.q_proj",
+        "model.layers.15.self_attn.v_proj",
+        "model.layers.23.self_attn.q_proj",
+        "model.layers.23.self_attn.v_proj",
+        "model.layers.31.self_attn.q_proj",
+        "model.layers.31.self_attn.v_proj",
+    ]
diff --git a/LlamaFactory/tests/model/model_utils/test_packing.py b/LlamaFactory/tests/model/model_utils/test_packing.py
new file mode 100644
index 0000000000000000000000000000000000000000..81e0d66a5bf4397f818d1a108e7cb78b76e64708
--- /dev/null
+++ b/LlamaFactory/tests/model/model_utils/test_packing.py
@@ -0,0 +1,68 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from llamafactory.model.model_utils.packing import get_seqlens_in_batch, get_unpad_data
+
+
+@pytest.mark.parametrize(
+    "attention_mask,golden_seq_lens",
+    [
+        (
+            [
+                [1, 1, 2, 2, 2, 0],
+                [1, 2, 2, 3, 3, 3],
+            ],
+            [2, 3, 1, 2, 3],
+        ),
+        (
+            [[1]],
+            [1],
+        ),
+    ],
+)
+def test_get_seqlens_in_batch(attention_mask, golden_seq_lens):
+    attention_mask_with_indices = torch.tensor(attention_mask)
+    seqlens_in_batch = get_seqlens_in_batch(attention_mask_with_indices)
+    assert torch.all(seqlens_in_batch == torch.tensor(golden_seq_lens))
+
+
+@pytest.mark.parametrize(
+    "attention_mask,golden_indices,golden_cu_seqlens,golden_max_seqlen",
+    [
+        (
+            [
+                [1, 1, 2, 2, 2, 0],
+                [1, 2, 2, 3, 3, 3],
+            ],
+            [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11],
+            [0, 2, 5, 6, 8, 11],
+            3,
+        ),
+        (
+            [[1]],
+            [0],
+            [0, 1],
+            1,
+        ),
+    ],
+)
+def test_get_unpad_data(attention_mask, golden_indices, golden_cu_seqlens, golden_max_seqlen):
+    attention_mask_with_indices = torch.tensor(attention_mask)
+    indices, cu_seqlens, max_seqlen_in_batch = get_unpad_data(attention_mask_with_indices)
+    assert torch.all(indices == torch.tensor(golden_indices))
+    assert torch.all(cu_seqlens == torch.tensor(golden_cu_seqlens, dtype=torch.int32))
+    assert max_seqlen_in_batch == golden_max_seqlen
diff --git a/LlamaFactory/tests/model/model_utils/test_visual.py b/LlamaFactory/tests/model/model_utils/test_visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..b195757222bdb57e09b6ac36a8d661d93a06f565
--- /dev/null
+++ b/LlamaFactory/tests/model/model_utils/test_visual.py
@@ -0,0 +1,104 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+import torch
+from safetensors.torch import load_file
+from transformers import AutoConfig, AutoModelForImageTextToText
+
+from llamafactory.extras.packages import is_transformers_version_greater_than
+from llamafactory.hparams import FinetuningArguments, ModelArguments
+from llamafactory.model.adapter import init_adapter
+
+
+@pytest.mark.parametrize("freeze_vision_tower", (False, True))
+@pytest.mark.parametrize("freeze_multi_modal_projector", (False, True))
+@pytest.mark.parametrize("freeze_language_model", (False, True))
+def test_visual_full(freeze_vision_tower: bool, freeze_multi_modal_projector: bool, freeze_language_model: bool):
+    model_args = ModelArguments(model_name_or_path="Qwen/Qwen2-VL-2B-Instruct")
+    finetuning_args = FinetuningArguments(
+        finetuning_type="full",
+        freeze_vision_tower=freeze_vision_tower,
+        freeze_multi_modal_projector=freeze_multi_modal_projector,
+        freeze_language_model=freeze_language_model,
+    )
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+    with torch.device("meta"):
+        model = AutoModelForImageTextToText.from_config(config)
+
+    model = init_adapter(config, model, model_args, finetuning_args, is_trainable=True)
+    for name, param in model.named_parameters():
+        if any(key in name for key in ["visual.patch_embed", "visual.blocks"]):
+            assert param.requires_grad != freeze_vision_tower
+        elif "visual.merger" in name:
+            assert param.requires_grad != freeze_multi_modal_projector
+        else:
+            assert param.requires_grad != freeze_language_model
+
+
+@pytest.mark.parametrize("freeze_vision_tower,freeze_language_model", ((False, False), (False, True), (True, False)))
+def test_visual_lora(freeze_vision_tower: bool, freeze_language_model: bool):
+    model_args = ModelArguments(model_name_or_path="Qwen/Qwen2-VL-2B-Instruct")
+    finetuning_args = FinetuningArguments(
+        finetuning_type="lora", freeze_vision_tower=freeze_vision_tower, freeze_language_model=freeze_language_model
+    )
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+    with torch.device("meta"):
+        model = AutoModelForImageTextToText.from_config(config)
+
+    model = init_adapter(config, model, model_args, finetuning_args, is_trainable=True)
+    trainable_params, frozen_params = set(), set()
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            trainable_params.add(name)
+        else:
+            frozen_params.add(name)
+
+    if is_transformers_version_greater_than("4.52.0"):
+        visual_param_name = "base_model.model.model.visual.blocks.0.attn.qkv.lora_A.default.weight"
+        language_param_name = "base_model.model.model.language_model.layers.0.self_attn.q_proj.lora_A.default.weight"
+        merger_param_name = "base_model.model.model.visual.merger.lora_A.default.weight"
+    else:
+        visual_param_name = "base_model.model.visual.blocks.0.attn.qkv.lora_A.default.weight"
+        language_param_name = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight"
+        merger_param_name = "base_model.model.visual.merger.lora_A.default.weight"
+
+    assert (visual_param_name in trainable_params) != freeze_vision_tower
+    assert (language_param_name in trainable_params) != freeze_language_model
+    assert (merger_param_name in trainable_params) is False
+
+
+def test_visual_model_save_load():
+    # check VLM's state dict: https://github.com/huggingface/transformers/pull/38385
+    model_args = ModelArguments(model_name_or_path="Qwen/Qwen2-VL-2B-Instruct")
+    finetuning_args = FinetuningArguments(finetuning_type="full")
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+    with torch.device("meta"):
+        model = AutoModelForImageTextToText.from_config(config)
+
+    model = init_adapter(config, model, model_args, finetuning_args, is_trainable=False)
+    model.to_empty(device="cpu")
+    loaded_model_weight = dict(model.named_parameters())
+
+    model.save_pretrained(os.path.join("output", "qwen2_vl"), max_shard_size="10GB", safe_serialization=True)
+    saved_model_weight = load_file(os.path.join("output", "qwen2_vl", "model.safetensors"))
+
+    if is_transformers_version_greater_than("4.52.0"):
+        assert "model.language_model.layers.0.self_attn.q_proj.weight" in loaded_model_weight
+    else:
+        assert "model.layers.0.self_attn.q_proj.weight" in loaded_model_weight
+
+    assert "model.layers.0.self_attn.q_proj.weight" in saved_model_weight
diff --git a/LlamaFactory/tests/model/test_base.py b/LlamaFactory/tests/model/test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..14afff633bc6ecd5bd9df1db55ee88f35e5fa78d
--- /dev/null
+++ b/LlamaFactory/tests/model/test_base.py
@@ -0,0 +1,43 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from llamafactory.train.test_utils import compare_model, load_infer_model, load_reference_model
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+TINY_LLAMA_VALUEHEAD = os.getenv("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
+
+INFER_ARGS = {  # keyword arguments unpacked into load_infer_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
+
+
+def test_base():
+    model = load_infer_model(**INFER_ARGS)
+    ref_model = load_reference_model(TINY_LLAMA3)
+    compare_model(model, ref_model)
+
+
+@pytest.mark.usefixtures("fix_valuehead_cpu_loading")
+def test_valuehead():
+    model = load_infer_model(add_valuehead=True, **INFER_ARGS)
+    ref_model = load_reference_model(TINY_LLAMA_VALUEHEAD, add_valuehead=True)
+    compare_model(model, ref_model)
diff --git a/LlamaFactory/tests/model/test_freeze.py b/LlamaFactory/tests/model/test_freeze.py
new file mode 100644
index 0000000000000000000000000000000000000000..b82ec88d5ac39465fbe0862221337d49d270bf79
--- /dev/null
+++ b/LlamaFactory/tests/model/test_freeze.py
@@ -0,0 +1,72 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import torch
+
+from llamafactory.train.test_utils import load_infer_model, load_train_model
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+TRAIN_ARGS = {  # keyword arguments unpacked into load_train_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "freeze",
+    "dataset": "llamafactory/tiny-supervised-dataset",
+    "dataset_dir": "ONLINE",
+    "template": "llama3",
+    "cutoff_len": 1024,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+INFER_ARGS = {  # keyword arguments unpacked into load_infer_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "finetuning_type": "freeze",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
+
+
+def test_freeze_train_all_modules():
+    model = load_train_model(freeze_trainable_layers=1, **TRAIN_ARGS)
+    for name, param in model.named_parameters():
+        if name.startswith("model.layers.1."):
+            assert param.requires_grad is True
+            assert param.dtype == torch.float32
+        else:
+            assert param.requires_grad is False
+            assert param.dtype == torch.float16
+
+
+def test_freeze_train_extra_modules():
+    model = load_train_model(freeze_trainable_layers=1, freeze_extra_modules="embed_tokens,lm_head", **TRAIN_ARGS)
+    for name, param in model.named_parameters():
+        if name.startswith("model.layers.1.") or any(module in name for module in ["embed_tokens", "lm_head"]):
+            assert param.requires_grad is True
+            assert param.dtype == torch.float32
+        else:
+            assert param.requires_grad is False
+            assert param.dtype == torch.float16
+
+
+def test_freeze_inference():
+    model = load_infer_model(**INFER_ARGS)
+    for param in model.parameters():
+        assert param.requires_grad is False
+        assert param.dtype == torch.float16
diff --git a/LlamaFactory/tests/model/test_full.py b/LlamaFactory/tests/model/test_full.py
new file mode 100644
index 0000000000000000000000000000000000000000..9058b6acf2a2db5e579fdb4ddd361d087a1fc310
--- /dev/null
+++ b/LlamaFactory/tests/model/test_full.py
@@ -0,0 +1,57 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import torch
+
+from llamafactory.train.test_utils import load_infer_model, load_train_model
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+TRAIN_ARGS = {  # keyword arguments unpacked into load_train_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "full",
+    "dataset": "llamafactory/tiny-supervised-dataset",
+    "dataset_dir": "ONLINE",
+    "template": "llama3",
+    "cutoff_len": 1024,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+INFER_ARGS = {  # keyword arguments unpacked into load_infer_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "finetuning_type": "full",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
+
+
+def test_full_train():
+    model = load_train_model(**TRAIN_ARGS)
+    for param in model.parameters():
+        assert param.requires_grad is True
+        assert param.dtype == torch.float32
+
+
+def test_full_inference():
+    model = load_infer_model(**INFER_ARGS)
+    for param in model.parameters():
+        assert param.requires_grad is False
+        assert param.dtype == torch.float16
diff --git a/LlamaFactory/tests/model/test_lora.py b/LlamaFactory/tests/model/test_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..38b6b505d0f2a9dde70e172f3a3023191fcfe9d9
--- /dev/null
+++ b/LlamaFactory/tests/model/test_lora.py
@@ -0,0 +1,103 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+import torch
+
+from llamafactory.train.test_utils import (
+    check_lora_model,
+    compare_model,
+    load_infer_model,
+    load_reference_model,
+    load_train_model,
+)
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+TINY_LLAMA_ADAPTER = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
+
+TINY_LLAMA_VALUEHEAD = os.getenv("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
+
+TRAIN_ARGS = {  # keyword arguments unpacked into load_train_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "lora",
+    "dataset": "llamafactory/tiny-supervised-dataset",
+    "dataset_dir": "ONLINE",
+    "template": "llama3",
+    "cutoff_len": 1024,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+INFER_ARGS = {  # keyword arguments unpacked into load_infer_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "adapter_name_or_path": TINY_LLAMA_ADAPTER,
+    "finetuning_type": "lora",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
+
+
+def test_lora_train_qv_modules():
+    model = load_train_model(lora_target="q_proj,v_proj", **TRAIN_ARGS)
+    linear_modules, _ = check_lora_model(model)
+    assert linear_modules == {"q_proj", "v_proj"}
+
+
+def test_lora_train_all_modules():
+    model = load_train_model(lora_target="all", **TRAIN_ARGS)
+    linear_modules, _ = check_lora_model(model)
+    assert linear_modules == {"q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"}
+
+
+def test_lora_train_extra_modules():
+    model = load_train_model(additional_target="embed_tokens,lm_head", **TRAIN_ARGS)
+    _, extra_modules = check_lora_model(model)
+    assert extra_modules == {"embed_tokens", "lm_head"}
+
+
+def test_lora_train_old_adapters():
+    model = load_train_model(adapter_name_or_path=TINY_LLAMA_ADAPTER, create_new_adapter=False, **TRAIN_ARGS)
+    ref_model = load_reference_model(TINY_LLAMA3, TINY_LLAMA_ADAPTER, use_lora=True, is_trainable=True)
+    compare_model(model, ref_model)
+
+
+def test_lora_train_new_adapters():
+    model = load_train_model(adapter_name_or_path=TINY_LLAMA_ADAPTER, create_new_adapter=True, **TRAIN_ARGS)
+    ref_model = load_reference_model(TINY_LLAMA3, TINY_LLAMA_ADAPTER, use_lora=True, is_trainable=True)
+    compare_model(
+        model, ref_model, diff_keys=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"]
+    )
+
+
+@pytest.mark.usefixtures("fix_valuehead_cpu_loading")
+def test_lora_train_valuehead():
+    model = load_train_model(add_valuehead=True, **TRAIN_ARGS)
+    ref_model = load_reference_model(TINY_LLAMA_VALUEHEAD, is_trainable=True, add_valuehead=True)
+    state_dict = model.state_dict()
+    ref_state_dict = ref_model.state_dict()
+    assert torch.allclose(state_dict["v_head.summary.weight"], ref_state_dict["v_head.summary.weight"])
+    assert torch.allclose(state_dict["v_head.summary.bias"], ref_state_dict["v_head.summary.bias"])
+
+
+def test_lora_inference():
+    model = load_infer_model(**INFER_ARGS)
+    ref_model = load_reference_model(TINY_LLAMA3, TINY_LLAMA_ADAPTER, use_lora=True).merge_and_unload()
+    compare_model(model, ref_model)
diff --git a/LlamaFactory/tests/model/test_pissa.py b/LlamaFactory/tests/model/test_pissa.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b6101f84be30bc99fd085b20bcee6cbc638bc7c
--- /dev/null
+++ b/LlamaFactory/tests/model/test_pissa.py
@@ -0,0 +1,64 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from llamafactory.train.test_utils import compare_model, load_infer_model, load_reference_model, load_train_model
+
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")
+
+TINY_LLAMA_PISSA = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa")
+
+TRAIN_ARGS = {  # keyword arguments unpacked into load_train_model(...)
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "lora",
+    "pissa_init": True,
+    "pissa_iter": -1,
+    "dataset": "llamafactory/tiny-supervised-dataset",
+    "dataset_dir": "ONLINE",
+    "template": "llama3",
+    "cutoff_len": 1024,
+    "output_dir": "dummy_dir",
+    "overwrite_output_dir": True,
+    "fp16": True,
+}
+
+INFER_ARGS = {  # keyword arguments unpacked into load_infer_model(...)
+    "model_name_or_path": TINY_LLAMA_PISSA,  # base model and adapter both point at the PiSSA checkpoint
+    "adapter_name_or_path": TINY_LLAMA_PISSA,
+    "adapter_folder": "pissa_init",
+    "finetuning_type": "lora",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
+
+
+@pytest.mark.xfail(reason="PiSSA initialization is not stable in different platform.")
+def test_pissa_train():
+    model = load_train_model(**TRAIN_ARGS)
+    ref_model = load_reference_model(TINY_LLAMA_PISSA, TINY_LLAMA_PISSA, use_pissa=True, is_trainable=True)
+    compare_model(model, ref_model)
+
+
+@pytest.mark.xfail(reason="Known connection error.")
+def test_pissa_inference():
+    # The inference-loaded model is compared against the reference PiSSA model after merge_and_unload().
+    model = load_infer_model(**INFER_ARGS)
+    reference = load_reference_model(TINY_LLAMA_PISSA, TINY_LLAMA_PISSA, use_pissa=True, is_trainable=False)
+    compare_model(model, reference.merge_and_unload())
diff --git a/LlamaFactory/tests/train/test_sft_trainer.py b/LlamaFactory/tests/train/test_sft_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f6ebe418383309190b735c22dd0d61b42397370
--- /dev/null
+++ b/LlamaFactory/tests/train/test_sft_trainer.py
@@ -0,0 +1,89 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+from transformers import DataCollatorWithPadding
+
+from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_model, load_tokenizer
+from llamafactory.train.sft.trainer import CustomSeq2SeqTrainer
+
+
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")  # NOTE(review): not referenced again in this file
+
+TINY_LLAMA3 = os.getenv("TINY_LLAMA3", "llamafactory/tiny-random-Llama-3")  # env var of the same name overrides
+
+TRAIN_ARGS = {  # shared settings merged into the get_train_args(...) input by test_shuffle
+    "model_name_or_path": TINY_LLAMA3,
+    "stage": "sft",
+    "do_train": True,
+    "finetuning_type": "lora",
+    "dataset": "llamafactory/tiny-supervised-dataset",
+    "dataset_dir": "ONLINE",
+    "template": "llama3",
+    "cutoff_len": 1024,
+    "overwrite_output_dir": True,
+    "per_device_train_batch_size": 1,
+    "max_steps": 1,
+    "report_to": "none",
+}
+
+
+@dataclass
+class DataCollatorWithVerbose(DataCollatorWithPadding):
+    verbose_list: list[dict[str, Any]] = field(default_factory=list)
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
+        features = [
+            {k: v for k, v in feature.items() if k in ["input_ids", "attention_mask", "labels"]}
+            for feature in features
+        ]
+        self.verbose_list.extend(features)
+        batch = super().__call__(features)
+        return {k: v[:, :1] for k, v in batch.items()}  # truncate input length
+
+
+@pytest.mark.parametrize("disable_shuffling", [False, True])
+def test_shuffle(disable_shuffling: bool):
+    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
+        {
+            "output_dir": os.path.join("output", f"shuffle{str(disable_shuffling).lower()}"),
+            "disable_shuffling": disable_shuffling,
+            **TRAIN_ARGS,
+        }
+    )
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+    data_collator = DataCollatorWithVerbose(tokenizer=tokenizer)
+    trainer = CustomSeq2SeqTrainer(
+        model=model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        data_collator=data_collator,
+        **dataset_module,
+        **tokenizer_module,
+    )
+    trainer.train()
+    if disable_shuffling:
+        assert data_collator.verbose_list[0]["input_ids"] == dataset_module["train_dataset"][0]["input_ids"]
+    else:
+        assert data_collator.verbose_list[0]["input_ids"] != dataset_module["train_dataset"][0]["input_ids"]
diff --git a/LlamaFactory/tests/version.txt b/LlamaFactory/tests/version.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdd7d35a4f8f5fde7db833387900c9ab21bfd3bd
--- /dev/null
+++ b/LlamaFactory/tests/version.txt
@@ -0,0 +1,2 @@
+# change if test fails or cache is outdated
+0.9.5.106
diff --git a/LlamaFactory/tests_v1/accelerator/test_interface.py b/LlamaFactory/tests_v1/accelerator/test_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3838f8b1227875a43d49f188166109ac410fbd9
--- /dev/null
+++ b/LlamaFactory/tests_v1/accelerator/test_interface.py
@@ -0,0 +1,67 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+import torch.multiprocessing as mp
+
+from llamafactory.v1.accelerator.helper import ReduceOp
+from llamafactory.v1.accelerator.interface import DistributedInterface
+from llamafactory.v1.utils.env import find_available_port
+from llamafactory.v1.utils.pytest import dist_env
+
+
+def _all_reduce_tests(local_rank: int, world_size: int, master_port: int):
+    """Worker body for the two-process collective test (used as an ``mp.spawn`` target).
+
+    Inside the ``dist_env`` context each rank contributes ``rank + 1.0`` and asserts the
+    results of the SUM, MEAN and MAX reductions, of ``all_gather`` and of ``broadcast``.
+
+    Args:
+        local_rank: Forwarded to ``dist_env``; ``mp.spawn`` supplies the process index here.
+        world_size: Forwarded to ``dist_env``.
+        master_port: Forwarded to ``dist_env``.
+    """
+    with dist_env(local_rank, world_size, master_port):
+        rank = DistributedInterface().get_rank()
+        # Re-read the world size from the interface; this shadows the argument.
+        world_size = DistributedInterface().get_world_size()
+        assert world_size == 2
+
+        # With two ranks the contributions are 1.0 and 2.0 (assumes ranks are 0 and 1).
+        y_sum = DistributedInterface().all_reduce(rank + 1.0, op=ReduceOp.SUM)
+        assert y_sum == pytest.approx(3.0)
+
+        y_mean = DistributedInterface().all_reduce(rank + 1.0, op=ReduceOp.MEAN)
+        assert y_mean == pytest.approx(1.5)
+
+        y_max = DistributedInterface().all_reduce(rank + 1.0, op=ReduceOp.MAX)
+        assert y_max == pytest.approx(2.0)
+
+        z = DistributedInterface().all_gather(rank + 1.0)
+        assert z == pytest.approx([1.0, 2.0])
+
+        # Every rank must end up with 1.0 (presumably the value of rank 0 - confirm the default source).
+        z = DistributedInterface().broadcast(rank + 1.0)
+        assert z == pytest.approx(1.0)
+
+
+def test_all_device():
+    """Compare the rank and world-size getters with the launcher environment variables.
+
+    Missing variables fall back to "0" for the ranks and "1" for the world sizes.
+    """
+    rank = DistributedInterface().get_rank()
+    assert rank == int(os.environ.get("RANK", "0"))
+
+    world_size = DistributedInterface().get_world_size()
+    assert world_size == int(os.environ.get("WORLD_SIZE", "1"))
+
+    local_rank = DistributedInterface().get_local_rank()
+    assert local_rank == int(os.environ.get("LOCAL_RANK", "0"))
+
+    local_world_size = DistributedInterface().get_local_world_size()
+    assert local_world_size == int(os.environ.get("LOCAL_WORLD_SIZE", "1"))
+
+
+@pytest.mark.runs_on(["cuda", "npu"])
+@pytest.mark.require_distributed(2)
+def test_multi_device():
+    """Spawn two worker processes that run ``_all_reduce_tests`` against a free master port."""
+    num_processes = 2
+    free_port = find_available_port()
+    mp.spawn(_all_reduce_tests, args=(num_processes, free_port), nprocs=num_processes)
+
+
+if __name__ == "__main__":
+    # Manual entry point: python tests_v1/accelerator/test_interface.py
+    test_all_device()
diff --git a/LlamaFactory/tests_v1/config/test_args_parser.py b/LlamaFactory/tests_v1/config/test_args_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..db235ab54020ff12d74ce974183dbb6c3c186f32
--- /dev/null
+++ b/LlamaFactory/tests_v1/config/test_args_parser.py
@@ -0,0 +1,82 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+from llamafactory.v1.config.arg_parser import get_args
+
+
+def test_get_args_from_yaml(tmp_path: Path):
+    """Parse a YAML config given on the command line and check the resulting argument groups.
+
+    The config is written to ``tmp_path`` and ``sys.argv`` is patched so that ``get_args``
+    runs with the file path as the only command-line argument.
+    """
+    config_yaml = """
+        ### model
+        model: llamafactory/tiny-random-qwen3
+        trust_remote_code: true
+        model_class: llm
+        kernel_config:
+            name: auto
+            include_kernels: auto # choice: null/true/false/auto/kernel_id1,kernel_id2,kernel_id3, default is null
+        peft_config:
+            name: lora
+            lora_rank: 0.8
+        quant_config: null
+
+        ### data
+        train_dataset: llamafactory/v1-sft-demo
+
+        ### training
+        output_dir: outputs/test_run
+        micro_batch_size: 1
+        global_batch_size: 1
+        cutoff_len: 2048
+        learning_rate: 1.0e-4
+        bf16: false
+        dist_config: null
+
+        ### sample
+        sample_backend: hf
+        max_new_tokens: 128
+    """
+
+    config_file = tmp_path / "config.yaml"
+    config_file.write_text(config_yaml, encoding="utf-8")
+
+    # argv[0] stands in for the script name; argv[1] is the config path
+    test_argv = ["test_args_parser.py", str(config_file)]
+
+    with patch.object(sys, "argv", test_argv):
+        # get_args returns four groups: model, data, training and sample arguments
+        model_args, data_args, training_args, sample_args = get_args()
+        assert data_args.train_dataset == "llamafactory/v1-sft-demo"
+        assert model_args.model == "llamafactory/tiny-random-qwen3"
+        # nested mappings expose ``name`` as an attribute and the other keys through ``get``
+        assert model_args.kernel_config.name == "auto"
+        assert model_args.kernel_config.get("include_kernels") == "auto"
+        assert model_args.peft_config.name == "lora"
+        # NOTE(review): lora_rank is 0.8 in the YAML above; this only checks that the value round-trips
+        assert model_args.peft_config.get("lora_rank") == 0.8
+        assert training_args.output_dir == "outputs/test_run"
+        assert training_args.micro_batch_size == 1
+        assert training_args.global_batch_size == 1
+        assert training_args.learning_rate == 1.0e-4
+        assert training_args.bf16 is False
+        assert training_args.dist_config is None
+        assert sample_args.sample_backend == "hf"
+
+
+if __name__ == "__main__":
+    # Manual entry point: python -m tests_v1.config.test_args_parser
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as scratch_dir:
+        test_get_args_from_yaml(tmp_path=Path(scratch_dir))
diff --git a/LlamaFactory/tests_v1/conftest.py b/LlamaFactory/tests_v1/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..adb08d49f798d4b1af7c8bb1019054d2b41ac42e
--- /dev/null
+++ b/LlamaFactory/tests_v1/conftest.py
@@ -0,0 +1,174 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""LlamaFactory test configuration.
+
+Contains shared fixtures, pytest configuration, and custom markers.
+"""
+
+import os
+import sys
+
+import pytest
+import torch
+import torch.distributed as dist
+from pytest import Config, FixtureRequest, Item, MonkeyPatch
+
+from llamafactory.v1.accelerator.helper import get_current_accelerator, get_device_count
+from llamafactory.v1.utils.env import is_env_enabled
+from llamafactory.v1.utils.packages import is_transformers_version_greater_than
+
+
+CURRENT_DEVICE = get_current_accelerator().type
+
+
+def pytest_configure(config: Config):
+    """Declare the custom markers of this suite: ``slow``, ``runs_on`` and ``require_distributed``."""
+    marker_docs = (
+        "slow: marks tests as slow (deselect with '-m \"not slow\"' or set RUN_SLOW=1 to run)",
+        "runs_on: test requires specific device type, e.g., @pytest.mark.runs_on(['cuda'])",
+        "require_distributed(num_devices): allow multi-device execution (default: 2)",
+    )
+    for marker_doc in marker_docs:
+        config.addinivalue_line("markers", marker_doc)
+
+
+def _handle_runs_on(items: list[Item]):
+    """Skip tests whose ``runs_on`` marker does not include the current device type.
+
+    The marker's first argument may be a single device name or a collection of names.
+    Tests without the marker are left untouched.
+    """
+    for test_item in items:
+        runs_on = test_item.get_closest_marker("runs_on")
+        if not runs_on:
+            continue
+
+        requested = runs_on.args[0]
+        devices = [requested] if isinstance(requested, str) else requested
+        if CURRENT_DEVICE in devices:
+            continue
+
+        skip_reason = f"test requires one of {devices} (current: {CURRENT_DEVICE})"
+        test_item.add_marker(pytest.mark.skip(reason=skip_reason))
+
+
+def _handle_slow_tests(items: list[Item]):
+    """Mark every ``slow`` test as skipped unless the ``RUN_SLOW`` environment switch is enabled."""
+    if is_env_enabled("RUN_SLOW"):
+        return
+
+    slow_skip = pytest.mark.skip(reason="slow test (set RUN_SLOW=1 to run)")
+    for test_item in items:
+        if "slow" not in test_item.keywords:
+            continue
+
+        test_item.add_marker(slow_skip)
+
+
+def _get_visible_devices_env() -> str | None:
+    """Return the name of the device-visibility variable for the current device type.
+
+    Returns ``None`` when the current device is neither ``cuda`` nor ``npu``.
+    """
+    env_names = {
+        "cuda": "CUDA_VISIBLE_DEVICES",
+        "npu": "ASCEND_RT_VISIBLE_DEVICES",
+    }
+    return env_names.get(CURRENT_DEVICE)
+
+
+def _handle_device_visibility(items: list[Item]):
+    """Skip ``require_distributed`` tests that need more devices than are visible.
+
+    The number of visible devices is the count of non-empty entries in the device-visibility
+    variable when it is set, and ``get_device_count()`` otherwise. A marker without arguments
+    requires 2 devices.
+    """
+    env_key = _get_visible_devices_env()
+    if env_key is None or CURRENT_DEVICE in ("cpu", "mps"):
+        return
+
+    # Parse visible devices
+    raw_visible = os.environ.get(env_key)
+    if raw_visible is not None:
+        listed_devices = [entry for entry in raw_visible.split(",") if entry != ""]
+        available = len(listed_devices)
+    else:
+        available = get_device_count()
+
+    for test_item in items:
+        dist_marker = test_item.get_closest_marker("require_distributed")
+        if not dist_marker:
+            continue
+
+        required = dist_marker.args[0] if dist_marker.args else 2
+        if available >= required:
+            continue
+
+        skip_reason = f"test requires {required} devices, but only {available} visible"
+        test_item.add_marker(pytest.mark.skip(reason=skip_reason))
+
+
+def pytest_collection_modifyitems(config: Config, items: list[Item]):
+    """Apply the suite's skip rules to the collected tests.
+
+    Tests whose path contains ``tests_v1`` are skipped when
+    ``is_transformers_version_greater_than("4.57.0")`` is false. Afterwards the ``slow``,
+    ``runs_on`` and ``require_distributed`` markers are processed by their helpers.
+
+    Args:
+        config: The pytest configuration object (unused here).
+        items: The collected test items; skip markers are added to them in place.
+    """
+    # The installed transformers version cannot change during collection, so check it once
+    # instead of once per collected item.
+    if not is_transformers_version_greater_than("4.57.0"):
+        skip_bc = pytest.mark.skip(reason="Skip backward compatibility tests")
+        for item in items:
+            if "tests_v1" in str(item.fspath):
+                item.add_marker(skip_bc)
+
+    _handle_slow_tests(items)
+    _handle_runs_on(items)
+    _handle_device_visibility(items)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def _set_env():
+    """Make the project root importable and disable tokenizer parallelism for the whole session.
+
+    The parent directory of this file is inserted at the front of ``sys.path`` (if missing) and
+    prepended to ``PYTHONPATH`` so that processes started by the tests inherit it.
+    """
+    # add project root dir to path for mp run
+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+
+    # Only append the inherited value when there is one: a dangling separator would leave an
+    # empty PYTHONPATH entry, which may put the working directory on the path of child processes.
+    inherited = os.getenv("PYTHONPATH", "")
+    os.environ["PYTHONPATH"] = project_root + os.pathsep + inherited if inherited else project_root
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+@pytest.fixture(autouse=True)
+def _cleanup_distributed_state():
+    """Tear down the default process group after each test, if one was initialized.
+
+    The fixture does nothing before the test; everything after ``yield`` runs as teardown.
+    """
+    yield
+    # Only destroy the group when the test actually initialized torch.distributed.
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+@pytest.fixture(autouse=True)
+def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -> None:
+    """Set the device-visibility variable for each test according to its markers.
+
+    * Tests marked ``require_distributed`` get either the explicit device ids given as the
+      marker's second argument or the ids ``0 .. required - 1``.
+    * All other tests are restricted to a single device: the first entry of the existing
+      variable, or ``"0"``. The device-count function of the current backend is additionally
+      patched to return 1.
+
+    Nothing is changed when the current device type has no visibility variable.
+    """
+    env_key = _get_visible_devices_env()
+    if not env_key:
+        return
+
+    # Save old environment for logic checks, monkeypatch handles restoration
+    old_value = os.environ.get(env_key)
+
+    marker = request.node.get_closest_marker("require_distributed")
+    if marker:  # distributed test
+        # First marker argument: number of devices (default 2); optional second: explicit device ids.
+        required = marker.args[0] if marker.args else 2
+        specific_devices = marker.args[1] if len(marker.args) > 1 else None
+
+        # NOTE(review): old_value is not consulted in this branch, so the ids are always
+        # 0..required-1 unless given explicitly - confirm this is intended when the suite is
+        # launched with a restricted device list.
+        if specific_devices:
+            devices_str = ",".join(map(str, specific_devices))
+        else:
+            devices_str = ",".join(str(i) for i in range(required))
+
+        monkeypatch.setenv(env_key, devices_str)
+
+    else:  # non-distributed test
+        if old_value:
+            # Keep only the first non-empty entry of the inherited device list.
+            visible_devices = [v for v in old_value.split(",") if v != ""]
+            monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0")
+        else:
+            monkeypatch.setenv(env_key, "0")
+
+        # Make the backend report a single device for the duration of the test.
+        if CURRENT_DEVICE == "cuda":
+            monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+        elif CURRENT_DEVICE == "npu":
+            monkeypatch.setattr(torch.npu, "device_count", lambda: 1)
diff --git a/LlamaFactory/tests_v1/core/test_data_engine.py b/LlamaFactory/tests_v1/core/test_data_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..373069a66d2179570f6004caf66aaa29fa07cc9c
--- /dev/null
+++ b/LlamaFactory/tests_v1/core/test_data_engine.py
@@ -0,0 +1,39 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+import pytest
+from datasets import load_dataset
+
+from llamafactory.v1.config.data_args import DataArguments
+from llamafactory.v1.core.data_engine import DataEngine
+
+
+@pytest.mark.parametrize("num_samples", [16])
+def test_map_dataset(num_samples: int):
+    """Spot-check randomly chosen ``DataEngine`` rows against the dataset returned by ``load_dataset``.
+
+    Each engine row must equal the original row plus a ``_dataset_name`` entry set to ``"default"``.
+    Indices are drawn with replacement.
+    """
+    data_args = DataArguments(train_dataset="llamafactory/v1-sft-demo")
+    data_engine = DataEngine(data_args.train_dataset)
+    original_data = load_dataset("llamafactory/v1-sft-demo", split="train")
+    for index in random.choices(range(len(data_engine)), k=num_samples):
+        print(data_engine[index])
+        expected_row = {"_dataset_name": "default", **original_data[index]}
+        assert data_engine[index] == expected_row
+
+
+if __name__ == "__main__":
+    # Manual entry point: python -m tests_v1.core.test_data_engine
+    test_map_dataset(1)
diff --git a/LlamaFactory/tests_v1/core/test_model_loader.py b/LlamaFactory/tests_v1/core/test_model_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..6228a36995132981416abe960a5c9aa1e9a90c25
--- /dev/null
+++ b/LlamaFactory/tests_v1/core/test_model_loader.py
@@ -0,0 +1,51 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from llamafactory.v1.config.model_args import ModelArguments
+from llamafactory.v1.core.model_engine import ModelEngine
+
+
+def test_tiny_qwen():
+    """Load the tiny random Qwen3 checkpoint and check the loaded class names and model dtype."""
+    model_engine = ModelEngine(ModelArguments(model="llamafactory/tiny-random-qwen3"))
+
+    processor_name = model_engine.processor.__class__.__name__
+    assert "Qwen2Tokenizer" in processor_name
+
+    config_name = model_engine.model_config.__class__.__name__
+    assert "Qwen3Config" in config_name
+
+    model_name = model_engine.model.__class__.__name__
+    assert "Qwen3ForCausalLM" in model_name
+
+    assert model_engine.model.dtype == torch.bfloat16
+
+
+def test_tiny_qwen_with_kernel_plugin():
+    """Load the tiny Qwen3 checkpoint with automatic kernel selection and inspect the RMSNorm forward.
+
+    When ``torch`` exposes an ``npu`` attribute, the first layer's input layernorm ``forward`` must
+    share its code object with ``npu_rms_norm_forward``; otherwise it must not.
+    """
+    from llamafactory.v1.plugins.model_plugins.kernels.ops.rms_norm.npu_rms_norm import npu_rms_norm_forward
+
+    kernel_config = {"name": "auto", "include_kernels": "auto"}
+    model_args = ModelArguments(model="llamafactory/tiny-random-qwen3", kernel_config=kernel_config)
+    model_engine = ModelEngine(model_args)
+    # test enable apply kernel plugin
+    layernorm_forward = model_engine.model.model.layers[0].input_layernorm.forward
+    uses_npu_kernel = layernorm_forward.__code__ == npu_rms_norm_forward.__code__
+    if hasattr(torch, "npu"):
+        assert uses_npu_kernel
+    else:
+        assert not uses_npu_kernel
+
+    assert "Qwen3ForCausalLM" in model_engine.model.__class__.__name__
+
+
+if __name__ == "__main__":
+    # Manual entry point: python -m tests_v1.core.test_model_loader
+    test_tiny_qwen()
+    test_tiny_qwen_with_kernel_plugin()
diff --git a/LlamaFactory/tests_v1/core/utils/test_batching.py b/LlamaFactory/tests_v1/core/utils/test_batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..87e8a89cb7abd4d58cf85ee094e42fff7b4764a0
--- /dev/null
+++ b/LlamaFactory/tests_v1/core/utils/test_batching.py
@@ -0,0 +1,52 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from llamafactory.v1.config import DataArguments, ModelArguments, TrainingArguments
+from llamafactory.v1.core.data_engine import DataEngine
+from llamafactory.v1.core.model_engine import ModelEngine
+from llamafactory.v1.core.utils.batching import BatchGenerator
+
+
+def test_normal_batching():
+    """Build a ``BatchGenerator`` with the ``"normal"`` strategy and check its length and first batch."""
+    data_args = DataArguments(train_dataset="llamafactory/v1-sft-demo")
+    data_engine = DataEngine(data_args.train_dataset)
+    model_args = ModelArguments(model="llamafactory/tiny-random-qwen3")
+    model_engine = ModelEngine(model_args=model_args)
+    training_args = TrainingArguments(
+        micro_batch_size=4,
+        global_batch_size=8,
+        cutoff_len=10,
+        batching_workers=0,
+        batching_strategy="normal",
+    )
+    batch_generator = BatchGenerator(
+        data_engine,
+        model_engine.renderer,
+        micro_batch_size=training_args.micro_batch_size,
+        global_batch_size=training_args.global_batch_size,
+        cutoff_len=training_args.cutoff_len,
+        batching_workers=training_args.batching_workers,
+        batching_strategy=training_args.batching_strategy,
+    )
+    # One generator step per full global batch; a trailing partial batch is not counted.
+    assert len(batch_generator) == len(data_engine) // training_args.global_batch_size
+    batch = next(iter(batch_generator))
+    # The first step holds 2 entries (presumably global_batch_size / micro_batch_size micro-batches - confirm).
+    assert len(batch) == 2
+    # The expected shape matches (micro_batch_size, cutoff_len) as configured above.
+    assert batch[0]["input_ids"].shape == (4, 10)
+
+
+if __name__ == "__main__":
+    # Manual entry point: python -m tests_v1.core.utils.test_batching
+    test_normal_batching()
diff --git a/LlamaFactory/tests_v1/core/utils/test_rendering.py b/LlamaFactory/tests_v1/core/utils/test_rendering.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e4797805f586bf39067775a5af83ed047c84529
--- /dev/null
+++ b/LlamaFactory/tests_v1/core/utils/test_rendering.py
@@ -0,0 +1,243 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+import pytest
+from transformers import AutoTokenizer
+
+from llamafactory.v1.config import DataArguments
+from llamafactory.v1.core.data_engine import DataEngine
+from llamafactory.v1.core.utils.rendering import Renderer
+from llamafactory.v1.utils.types import Processor
+
+
+def _get_input_ids(inputs: list | dict) -> list:
+    """Return ``inputs`` itself when it is a list, otherwise its ``"input_ids"`` entry."""
+    return inputs if isinstance(inputs, list) else inputs["input_ids"]
+
+
+# Reference conversation in the plain-string format passed to ``tokenizer.apply_chat_template``.
+HF_MESSAGES = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is LLM?"},
+    {"role": "assistant", "content": "LLM stands for Large Language Model."},
+]
+
+# The same conversation in the format passed to ``Renderer.render_messages``:
+# every message's content is a list of typed parts.
+V1_MESSAGES = [
+    {"role": "system", "content": [{"type": "text", "value": "You are a helpful assistant."}]},
+    {"role": "user", "content": [{"type": "text", "value": "What is LLM?"}]},
+    {"role": "assistant", "content": [{"type": "text", "value": "LLM stands for Large Language Model."}]},
+]
+
+# Tool-use conversation for ``apply_chat_template``; the assistant turn carries a structured tool call.
+HF_MESSAGES_WITH_TOOLS = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is 6*8?"},
+    {
+        "role": "assistant",
+        "tool_calls": [{"type": "function", "function": {"name": "multiply", "arguments": {"a": 6, "b": 8}}}],
+    },
+    {"role": "tool", "content": "48."},
+    {"role": "assistant", "content": "The result of 6*8 is 48."},
+]
+
+# The same tool-use conversation for the renderer: the tool call is a JSON string inside a
+# ``tool_call`` part, and that assistant turn is given a loss weight of 0.0.
+V1_MESSAGES_WITH_TOOLS = [
+    {"role": "system", "content": [{"type": "text", "value": "You are a helpful assistant."}]},
+    {"role": "user", "content": [{"type": "text", "value": "What is 6*8?"}]},
+    {
+        "role": "assistant",
+        "content": [{"type": "tool_call", "value": json.dumps({"name": "multiply", "arguments": {"a": 6, "b": 8}})}],
+        "loss_weight": 0.0,
+    },
+    {"role": "tool", "content": [{"type": "text", "value": "48."}]},
+    {"role": "assistant", "content": [{"type": "text", "value": "The result of 6*8 is 48."}]},
+]
+
+# Tool schema shared by both paths: passed as-is to ``apply_chat_template`` and as a JSON
+# string to ``Renderer.render_messages``.
+V1_TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "multiply",
+            "description": "A function that multiplies two numbers",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "a": {"type": "number", "description": "The first number to multiply"},
+                    "b": {"type": "number", "description": "The second number to multiply"},
+                },
+                "required": ["a", "b"],
+            },
+        },
+    }
+]
+
+
+def test_chatml_rendering():
+    """Compare the ``chatml`` renderer with the tokenizer's own chat template.
+
+    Covers a generation prompt (conversation without the final assistant turn) and the full
+    training conversation.
+    """
+    tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
+    renderer = Renderer(template="chatml", processor=tokenizer)
+
+    # Generation prompt: every token is expected to be masked (label -100, loss weight 0.0).
+    hf_inputs = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES[:-1], add_generation_prompt=True))
+    v1_inputs = renderer.render_messages(V1_MESSAGES[:-1], is_generate=True)
+    assert v1_inputs["input_ids"] == hf_inputs
+    assert v1_inputs["attention_mask"] == [1] * len(hf_inputs)
+    assert v1_inputs["labels"] == [-100] * len(hf_inputs)
+    assert v1_inputs["loss_weights"] == [0.0] * len(hf_inputs)
+
+    # Full conversation: the tokens of the prompt rendered without a generation prompt are
+    # masked; all remaining tokens keep their ids as labels and get loss weight 1.0.
+    hf_inputs_part = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES[:-1], add_generation_prompt=False))
+    hf_inputs_full = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES, add_generation_prompt=False))
+    v1_inputs_full = renderer.render_messages(V1_MESSAGES, is_generate=False)
+    assert v1_inputs_full["input_ids"] == hf_inputs_full
+    assert v1_inputs_full["attention_mask"] == [1] * len(hf_inputs_full)
+    assert v1_inputs_full["labels"] == [-100] * len(hf_inputs_part) + hf_inputs_full[len(hf_inputs_part) :]
+    assert v1_inputs_full["loss_weights"] == [0.0] * len(hf_inputs_part) + [1.0] * (
+        len(hf_inputs_full) - len(hf_inputs_part)
+    )
+
+
+def test_chatml_parse():
+    """Parse plain generated text with the ``chatml`` renderer and expect the reference assistant message."""
+    tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
+    renderer = Renderer(template="chatml", processor=tokenizer)
+    expected_message = V1_MESSAGES[-1]
+    assert renderer.parse_message("LLM stands for Large Language Model.") == expected_message
+
+
+@pytest.mark.parametrize("num_samples", [16])
+def test_chatml_rendering_remote(num_samples: int):
+    """Render the first ``num_samples`` rows of the demo dataset with the ``chatml`` template.
+
+    Every rendered sample must start with the tokens of ``"<|im_start|>user\\n"``.
+    """
+    tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
+    renderer = Renderer(template="chatml", processor=tokenizer)
+    data_args = DataArguments(train_dataset="llamafactory/v1-sft-demo")
+    data_engine = DataEngine(data_args.train_dataset)
+    # The expected prefix does not depend on the sample, so tokenize it once.
+    prefix = tokenizer.encode("<|im_start|>user\n", add_special_tokens=False)
+    for index in range(num_samples):
+        v1_inputs = renderer.render_messages(data_engine[index]["messages"], is_generate=True)
+        rendered_prefix = v1_inputs["input_ids"][: len(prefix)]
+        print(tokenizer.decode(rendered_prefix))
+        assert rendered_prefix == prefix
+
+
+def test_qwen3_nothink_rendering():
+    """Compare the ``qwen3_nothink`` renderer with the tokenizer's chat template on a tool-use conversation.
+
+    The tokenizer receives the tool schema as Python objects, the renderer as a JSON string.
+    """
+    tokenizer: Processor = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
+    renderer = Renderer(template="qwen3_nothink", processor=tokenizer)
+
+    # Generation prompt: every token is expected to be masked (label -100, loss weight 0.0).
+    hf_inputs = _get_input_ids(
+        tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS[:-1], tools=V1_TOOLS, add_generation_prompt=True)
+    )
+    v1_inputs = renderer.render_messages(V1_MESSAGES_WITH_TOOLS[:-1], tools=json.dumps(V1_TOOLS), is_generate=True)
+    assert v1_inputs["input_ids"] == hf_inputs
+    assert v1_inputs["attention_mask"] == [1] * len(hf_inputs)
+    assert v1_inputs["labels"] == [-100] * len(hf_inputs)
+    assert v1_inputs["loss_weights"] == [0.0] * len(hf_inputs)
+
+    # Full conversation: everything before the final assistant turn is masked, including the
+    # tool-call turn (its loss weight is 0.0 in the fixture); the rest gets loss weight 1.0.
+    hf_inputs_part = _get_input_ids(
+        tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS[:-1], tools=V1_TOOLS, add_generation_prompt=False)
+    )
+    hf_inputs_full = _get_input_ids(
+        tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS, tools=V1_TOOLS, add_generation_prompt=False)
+    )
+    v1_inputs_full = renderer.render_messages(V1_MESSAGES_WITH_TOOLS, tools=json.dumps(V1_TOOLS), is_generate=False)
+    assert v1_inputs_full["input_ids"] == hf_inputs_full
+    assert v1_inputs_full["attention_mask"] == [1] * len(hf_inputs_full)
+    assert v1_inputs_full["labels"] == [-100] * len(hf_inputs_part) + hf_inputs_full[len(hf_inputs_part) :]
+    assert v1_inputs_full["loss_weights"] == [0.0] * len(hf_inputs_part) + [1.0] * (
+        len(hf_inputs_full) - len(hf_inputs_part)
+    )
+
+
+def test_qwen3_nothink_parse():
+    """Parse generated text containing reasoning, plain text and a tool call.
+
+    The parsed assistant message must hold three parts in that order: ``reasoning``,
+    ``text`` and ``tool_call`` (the latter as a JSON string).
+    """
+    tokenizer: Processor = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
+    renderer = Renderer(template="qwen3_nothink", processor=tokenizer)
+    raw_output = (
+        "<thinking>I need to use the multiply function to calculate 6*8.</thinking>"
+        "Let me call the multiply function."
+        '<tool_call>{"name": "multiply", "arguments": {"a": 6, "b": 8}}</tool_call>'
+    )
+    expected_parts = [
+        {"type": "reasoning", "value": "I need to use the multiply function to calculate 6*8."},
+        {"type": "text", "value": "Let me call the multiply function."},
+        {"type": "tool_call", "value": json.dumps({"name": "multiply", "arguments": {"a": 6, "b": 8}})},
+    ]
+    expected_message = {"role": "assistant", "content": expected_parts}
+    assert renderer.parse_message(raw_output) == expected_message
+
+
+@pytest.mark.parametrize("num_samples", [8])
+def test_qwen3_nothink_rendering_remote(num_samples: int):
+    """Render the first ``num_samples`` rows of the tool-use demo dataset with ``qwen3_nothink``.
+
+    Every rendered sample must start with the tokens of the fixed system prompt and the opening
+    of the tool list.
+    """
+    tokenizer: Processor = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
+    renderer = Renderer(template="qwen3_nothink", processor=tokenizer)
+    data_args = DataArguments(train_dataset="llamafactory/reason-tool-use-demo-1500")
+    data_engine = DataEngine(data_args.train_dataset)
+    # The expected prefix is the same for every sample, so build and tokenize it once.
+    prefix_text = (
+        "<|im_start|>system\nYou are a methodical and expert assistant. "
+        "Your primary goal is to solve user requests by leveraging a set of available tools. "
+        "You must reason for the best course of action in a structured manner before responding.\n\n"
+        "# Tools\n\nYou may call one or more functions to assist with the user query.\n\n"
+        "You are provided with function signatures within <tools></tools> XML tags:\n<tools>\n"
+        '{"type": "function", "function": {"name":'
+    )
+    prefix = tokenizer.encode(prefix_text, add_special_tokens=False)
+    for index in range(num_samples):
+        sample = data_engine[index]  # fetch the row once instead of once per field
+        v1_inputs = renderer.render_messages(sample["messages"], tools=sample["tools"])
+        rendered_prefix = v1_inputs["input_ids"][: len(prefix)]
+        print(tokenizer.decode(rendered_prefix))
+        assert rendered_prefix == prefix
+
+
+def test_process_sft_samples():
+    """Process one SFT sample and check its token ids and that the extra fields are carried over."""
+    tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
+    renderer = Renderer(template="chatml", processor=tokenizer)
+    hf_inputs = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES))
+
+    sft_sample = {"messages": V1_MESSAGES, "extra_info": "test", "_dataset_name": "default"}
+    model_inputs = renderer.process_samples([sft_sample])
+    assert len(model_inputs) == 1
+
+    processed = model_inputs[0]
+    assert processed["input_ids"] == hf_inputs
+    assert processed["extra_info"] == "test"
+    assert processed["_dataset_name"] == "default"
+
+
+def test_process_dpo_samples():
+    """Process one preference sample whose chosen and rejected conversations are identical.
+
+    The expected ``input_ids`` are the reference ids repeated twice; ``token_type_ids`` are 1 for
+    the first copy and 2 for the second (presumably chosen then rejected - confirm).
+    """
+    tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
+    renderer = Renderer(template="chatml", processor=tokenizer)
+    hf_inputs = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES))
+
+    samples = [
+        {
+            "chosen_messages": V1_MESSAGES,
+            "rejected_messages": V1_MESSAGES,
+            "extra_info": "test",
+            "_dataset_name": "default",
+        }
+    ]
+    model_inputs = renderer.process_samples(samples)
+    # Both conversations end up in a single output entry.
+    assert len(model_inputs) == 1
+    assert model_inputs[0]["input_ids"] == hf_inputs * 2
+    assert model_inputs[0]["token_type_ids"] == [1] * len(hf_inputs) + [2] * len(hf_inputs)
+    # Fields other than the messages are carried over unchanged.
+    assert model_inputs[0]["extra_info"] == "test"
+    assert model_inputs[0]["_dataset_name"] == "default"
+
+
+if __name__ == "__main__":
+    # Manual entry point: python -m tests_v1.core.utils.test_rendering
+    test_chatml_rendering()
+    test_chatml_parse()
+    test_chatml_rendering_remote(16)
+    test_qwen3_nothink_rendering()
+    test_qwen3_nothink_parse()
+    test_qwen3_nothink_rendering_remote(16)
+    test_process_sft_samples()
+    test_process_dpo_samples()
diff --git a/LlamaFactory/tests_v1/plugins/data_plugins/test_converter.py b/LlamaFactory/tests_v1/plugins/data_plugins/test_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1722b4a67efbdfdb1056c893249531d93995db9e
--- /dev/null
+++ b/LlamaFactory/tests_v1/plugins/data_plugins/test_converter.py
@@ -0,0 +1,128 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+import pytest
+from datasets import load_dataset
+
+from llamafactory.v1.config.data_args import DataArguments
+from llamafactory.v1.core.data_engine import DataEngine
+from llamafactory.v1.plugins.data_plugins.converter import DataConverterPlugin
+
+
+@pytest.mark.parametrize("num_samples", [16])
+def test_alpaca_converter(num_samples: int):
+    data_args = DataArguments(train_dataset="llamafactory/v1-dataset-info/tiny-supervised-dataset.yaml")
+    data_engine = DataEngine(data_args.train_dataset)
+    original_data = load_dataset("llamafactory/tiny-supervised-dataset", split="train")
+    indexes = random.choices(range(len(data_engine)), k=num_samples)
+    for index in indexes:
+        print(data_engine[index])
+        expected_data = {
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "value": original_data[index]["instruction"] + original_data[index]["input"]}
+                    ],
+                    "loss_weight": 0.0,
+                },
+                {
+                    "role": "assistant",
+                    "content": [{"type": "text", "value": original_data[index]["output"]}],
+                    "loss_weight": 1.0,
+                },
+            ]
+        }
+        assert data_engine[index] == {"_dataset_name": "tiny_dataset", **expected_data}
+
+
+def test_sharegpt_converter():
+    example = {
+        "conversations": [
+            {"from": "system", "value": "System"},
+            {"from": "human", "value": "User"},
+            {"from": "function_call", "value": "1"},
+            {"from": "observation", "value": "Observation"},
+            {"from": "gpt", "value": "Assistant"},
+        ]
+    }
+    expected_data = {
+        "messages": [
+            {"role": "system", "content": [{"type": "text", "value": "System"}], "loss_weight": 0.0},
+            {"role": "user", "content": [{"type": "text", "value": "User"}], "loss_weight": 0.0},
+            {"role": "assistant", "content": [{"type": "tool_call", "value": "1"}], "loss_weight": 1.0},
+            {"role": "tool", "content": [{"type": "text", "value": "Observation"}], "loss_weight": 0.0},
+            {"role": "assistant", "content": [{"type": "text", "value": "Assistant"}], "loss_weight": 1.0},
+        ]
+    }
+    assert DataConverterPlugin("sharegpt")(example) == expected_data
+
+
+@pytest.mark.parametrize("num_samples", [16])
+def test_pair_converter(num_samples: int):
+    data_args = DataArguments(train_dataset="llamafactory/v1-dataset-info/orca-dpo-pairs.yaml")
+    data_engine = DataEngine(data_args.train_dataset)
+    original_data = load_dataset("HuggingFaceH4/orca_dpo_pairs", split="train_prefs")
+    indexes = random.choices(range(len(data_engine)), k=num_samples)
+    for index in indexes:
+        print(data_engine[index])
+        print(original_data[index])
+        expected_data = {
+            "chosen_messages": [
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "value": original_data[index]["chosen"][0]["content"]}],
+                    "loss_weight": 0.0,
+                },
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "value": original_data[index]["chosen"][1]["content"]}],
+                    "loss_weight": 0.0,
+                },
+                {
+                    "role": "assistant",
+                    "content": [{"type": "text", "value": original_data[index]["chosen"][2]["content"]}],
+                    "loss_weight": 1.0,
+                },
+            ],
+            "rejected_messages": [
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "value": original_data[index]["rejected"][0]["content"]}],
+                    "loss_weight": 0.0,
+                },
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "value": original_data[index]["rejected"][1]["content"]}],
+                    "loss_weight": 0.0,
+                },
+                {
+                    "role": "assistant",
+                    "content": [{"type": "text", "value": original_data[index]["rejected"][2]["content"]}],
+                    "loss_weight": 1.0,
+                },
+            ],
+        }
+        assert data_engine[index] == {"_dataset_name": "tiny_dataset", **expected_data}
+
+
+if __name__ == "__main__":
+    """
+    python -m tests_v1.plugins.data_plugins.test_converter
+    """
+    test_alpaca_converter(1)
+    test_sharegpt_converter()
+    test_pair_converter(1)
diff --git a/LlamaFactory/tests_v1/plugins/model_plugins/test_init_plugin.py b/LlamaFactory/tests_v1/plugins/model_plugins/test_init_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..947f18bd97725c61282a393a1a5310cefd0b9d28
--- /dev/null
+++ b/LlamaFactory/tests_v1/plugins/model_plugins/test_init_plugin.py
@@ -0,0 +1,63 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from llamafactory.v1.accelerator.interface import DistributedInterface
+from llamafactory.v1.config.arg_parser import get_args
+from llamafactory.v1.core.model_engine import ModelEngine
+
+
+def test_init_on_meta():
+    model_args, *_ = get_args(
+        dict(
+            model="llamafactory/tiny-random-qwen3",
+            init_config={"name": "init_on_meta"},
+        )
+    )
+    model_engine = ModelEngine(model_args=model_args)
+    assert model_engine.model.device.type == "meta"
+
+
+def test_init_on_rank0():
+    model_args, *_ = get_args(
+        dict(
+            model="llamafactory/tiny-random-qwen3",
+            init_config={"name": "init_on_rank0"},
+        )
+    )
+    model_engine = ModelEngine(model_args=model_args)
+    if DistributedInterface().get_rank() == 0:
+        assert model_engine.model.device.type == "cpu"
+    else:
+        assert model_engine.model.device.type == "meta"
+
+
+def test_init_on_default():
+    model_args, *_ = get_args(
+        dict(
+            model="llamafactory/tiny-random-qwen3",
+            init_config={"name": "init_on_default"},
+        )
+    )
+    model_engine = ModelEngine(model_args=model_args)
+    assert model_engine.model.device == DistributedInterface().current_device
+
+
+if __name__ == "__main__":
+    """
+    python tests_v1/plugins/model_plugins/test_init_plugin.py
+    """
+    test_init_on_meta()
+    test_init_on_rank0()
+    test_init_on_default()
diff --git a/LlamaFactory/tests_v1/plugins/model_plugins/test_kernel_plugin.py b/LlamaFactory/tests_v1/plugins/model_plugins/test_kernel_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4207fed2af89ffe72befb4e9e2a6cdf4711a72a
--- /dev/null
+++ b/LlamaFactory/tests_v1/plugins/model_plugins/test_kernel_plugin.py
@@ -0,0 +1,73 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from unittest.mock import MagicMock, patch
+
+import torch.multiprocessing as mp
+from transformers import AutoModelForCausalLM
+
+
+def _apply_kernel(rank) -> None:
+    with patch("torch.accelerator.current_accelerator") as mock_get_accelerator:
+        mock_device = MagicMock()
+        setattr(mock_device, "type", "npu")
+        mock_get_accelerator.return_value = mock_device
+
+        # reload kernel modules to respect mocked accelerator
+        for k in list(sys.modules.keys()):
+            if k.startswith("llamafactory.v1.plugins.model_plugins.kernels"):
+                del sys.modules[k]
+
+        from llamafactory.v1.plugins.model_plugins.kernels.interface import apply_default_kernels
+
+        model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen3")
+        original_rmsnorm_forward = model.model.layers[0].input_layernorm.forward
+        original_swiglu_forward = model.model.layers[0].mlp.forward
+
+        model = apply_default_kernels(model=model, include_kernels="npu_fused_rmsnorm")
+
+        assert model.model.layers[0].input_layernorm.forward.__func__ is not original_rmsnorm_forward.__func__
+        assert model.model.layers[0].mlp.forward.__func__ is original_swiglu_forward.__func__
+
+
+def _apply_all_kernels(rank) -> None:
+    with patch("torch.accelerator.current_accelerator") as mock_get_accelerator:
+        mock_device = MagicMock()
+        setattr(mock_device, "type", "npu")
+        mock_get_accelerator.return_value = mock_device
+
+        # reload kernel modules to respect mocked accelerator
+        for k in list(sys.modules.keys()):
+            if k.startswith("llamafactory.v1.plugins.model_plugins.kernels"):
+                del sys.modules[k]
+
+        from llamafactory.v1.plugins.model_plugins.kernels.interface import apply_default_kernels
+
+        model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen3")
+        original_rmsnorm_forward = model.model.layers[0].input_layernorm.forward
+        original_swiglu_forward = model.model.layers[0].mlp.forward
+
+        model = apply_default_kernels(model=model, include_kernels=True)
+
+        assert model.model.layers[0].input_layernorm.forward.__func__ is not original_rmsnorm_forward.__func__
+        assert model.model.layers[0].mlp.forward.__func__ is not original_swiglu_forward.__func__
+
+
+def test_apply_kernel():
+    mp.spawn(_apply_kernel)
+
+
+def test_apply_all_kernels():
+    mp.spawn(_apply_all_kernels)
diff --git a/LlamaFactory/tests_v1/sampler/test_cli_sampler.py b/LlamaFactory/tests_v1/sampler/test_cli_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f858e1f9c4a58b495bbd2441205d6f14ab6ecf2
--- /dev/null
+++ b/LlamaFactory/tests_v1/sampler/test_cli_sampler.py
@@ -0,0 +1,44 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from llamafactory.v1.config import ModelArguments, SampleArguments
+from llamafactory.v1.core.model_engine import ModelEngine
+from llamafactory.v1.samplers.cli_sampler import SyncSampler
+
+
+@pytest.mark.runs_on(["cuda", "npu"])
+def test_sync_sampler():
+    model_args = ModelArguments(model="Qwen/Qwen3-4B-Instruct-2507", template="qwen3_nothink")
+    sample_args = SampleArguments()
+    model_engine = ModelEngine(model_args)
+    sampler = SyncSampler(sample_args, model_args, model_engine.model, model_engine.renderer)
+    messages = [{"role": "user", "content": [{"type": "text", "value": "Say 'This is a test.'"}]}]
+    response = ""
+    for new_text in sampler.generate(messages):
+        response += new_text
+
+    print(response)
+    assert model_engine.renderer.parse_message(response) == {
+        "role": "assistant",
+        "content": [{"type": "text", "value": "This is a test."}],
+    }
+
+
+if __name__ == "__main__":
+    """
+    python tests_v1/sampler/test_cli_sampler.py
+    """
+    test_sync_sampler()
diff --git a/LlamaFactory/tests_v1/trainers/test_fsdp2_sft_trainer.py b/LlamaFactory/tests_v1/trainers/test_fsdp2_sft_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..875f557858a1468dda612e4e04cb8f1c4d2f52d1
--- /dev/null
+++ b/LlamaFactory/tests_v1/trainers/test_fsdp2_sft_trainer.py
@@ -0,0 +1,89 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+
+@pytest.mark.xfail(reason="CI machines may OOM when heavily loaded.")
+@pytest.mark.runs_on(["cuda", "npu"])
+def test_fsdp2_sft_trainer(tmp_path: Path):
+    """Test FSDP2 SFT trainer by simulating `llamafactory-cli sft config.yaml` behavior."""
+    config_yaml = """\
+model: Qwen/Qwen3-0.6B
+trust_remote_code: true
+model_class: llm
+
+template: qwen3_nothink
+
+kernel_config:
+    name: auto
+    include_kernels: auto
+
+quant_config: null
+
+dist_config:
+    name: fsdp2
+    dcp_path: null
+
+init_config:
+    name: init_on_meta
+
+### data
+train_dataset: data/v1_sft_demo.yaml
+
+### training
+output_dir: {output_dir}
+micro_batch_size: 1
+global_batch_size: 1
+cutoff_len: 2048
+learning_rate: 1.0e-4
+bf16: false
+max_steps: 1
+
+### sample
+sample_backend: hf
+max_new_tokens: 128
+"""
+    # Create output directory
+    output_dir = tmp_path / "outputs"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    config_file = tmp_path / "config.yaml"
+    config_file.write_text(config_yaml.format(output_dir=str(output_dir)))
+
+    # Set up environment variables
+    env = os.environ.copy()
+    env["USE_V1"] = "1"  # Use v1 launcher
+    env["FORCE_TORCHRUN"] = "1"  # Force distributed training via torchrun
+
+    # Run the CLI command via subprocess
+    # This simulates: llamafactory-cli sft config.yaml
+    result = subprocess.run(
+        [sys.executable, "-m", "llamafactory.cli", "sft", str(config_file)],
+        env=env,
+        capture_output=True,
+        cwd=str(Path(__file__).parent.parent.parent),  # LLaMA-Factory root
+    )
+
+    # Decode output with error handling (progress bars may contain non-UTF-8 bytes)
+    stderr = result.stderr.decode("utf-8", errors="replace")
+
+    # Check the result
+    assert result.returncode == 0, f"Training failed with return code {result.returncode}\nSTDERR: {stderr}"
+
+    # Verify output files exist (optional - adjust based on what run_sft produces)
+    # assert (output_dir / "some_expected_file").exists()
diff --git a/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/config.yaml b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ad9c49bd6b4db9ca34923855691c4fe1ec14c133
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/config.yaml
@@ -0,0 +1,723 @@
+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.1
+        e:
+            w3ltcjzxbduoqrmb4bdrwr1550cgr9rj:
+                args:
+                    - /workspace/v127rc_exp1/B.yaml
+                cpu_count: 24
+                cpu_count_logical: 48
+                cudaVersion: "12.8"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "1931444224"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-acb5171c-45e7-5653-1120-9d0cd2a192a6
+                host: 34f54978776c
+                memory:
+                    total: "405012275200"
+                os: Linux-6.8.0-90-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-04T03:49:47.693011Z"
+                writerId: w3ltcjzxbduoqrmb4bdrwr1550cgr9rj
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.1
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.1
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t35_d0_r286
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/B
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - down_proj
+                - k_proj
+                - v_proj
+                - q_proj
+                - gate_proj
+                - up_proj
+                - o_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0
diff --git a/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/output.log b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..ec10ebf79030161b972dbda00c2a35a0f82f2af1
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/output.log
@@ -0,0 +1,781 @@
+  0%|                                                                                                                                                                                       | 0/40950 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+                                                                                                                                                                                                                      
+{'loss': '1.719', 'grad_norm': '0.3139', 'learning_rate': '0', 'epoch': '0.0001221', 'num_input_tokens_seen': 2047, 'train_runtime': '3.131', 'train_tokens_per_second': '653.7'}
+{'loss': '1.142', 'grad_norm': '0.2725', 'learning_rate': '6.105e-08', 'epoch': '0.0002442', 'num_input_tokens_seen': 4094, 'train_runtime': '4.163', 'train_tokens_per_second': '983.5'}
+{'loss': '1.388', 'grad_norm': '0.377', 'learning_rate': '1.221e-07', 'epoch': '0.0003663', 'num_input_tokens_seen': 6141, 'train_runtime': '5.192', 'train_tokens_per_second': '1183'}
+{'loss': '1.458', 'grad_norm': '0.2883', 'learning_rate': '1.832e-07', 'epoch': '0.0004884', 'num_input_tokens_seen': 8188, 'train_runtime': '6.223', 'train_tokens_per_second': '1316'}
+{'loss': '1.288', 'grad_norm': '0.2511', 'learning_rate': '2.442e-07', 'epoch': '0.0006105', 'num_input_tokens_seen': 10235, 'train_runtime': '7.255', 'train_tokens_per_second': '1411'}
+{'loss': '0.01254', 'grad_norm': '0.042', 'learning_rate': '3.053e-07', 'epoch': '0.0007326', 'num_input_tokens_seen': 12282, 'train_runtime': '8.283', 'train_tokens_per_second': '1483'}
+{'loss': '0.8576', 'grad_norm': '0.2707', 'learning_rate': '3.663e-07', 'epoch': '0.0008547', 'num_input_tokens_seen': 14329, 'train_runtime': '9.313', 'train_tokens_per_second': '1539'}
+{'loss': '1.581', 'grad_norm': '0.2901', 'learning_rate': '4.274e-07', 'epoch': '0.0009768', 'num_input_tokens_seen': 16376, 'train_runtime': '10.34', 'train_tokens_per_second': '1583'}
+{'loss': '1.571', 'grad_norm': '0.2915', 'learning_rate': '4.884e-07', 'epoch': '0.001099', 'num_input_tokens_seen': 18423, 'train_runtime': '11.4', 'train_tokens_per_second': '1617'}
+{'loss': '1.346', 'grad_norm': '0.2843', 'learning_rate': '5.495e-07', 'epoch': '0.001221', 'num_input_tokens_seen': 20470, 'train_runtime': '12.43', 'train_tokens_per_second': '1647'}
+{'loss': '1.65', 'grad_norm': '0.4536', 'learning_rate': '6.105e-07', 'epoch': '0.001343', 'num_input_tokens_seen': 22517, 'train_runtime': '13.46', 'train_tokens_per_second': '1673'}
+{'loss': '1.488', 'grad_norm': '0.3475', 'learning_rate': '6.716e-07', 'epoch': '0.001465', 'num_input_tokens_seen': 24564, 'train_runtime': '14.5', 'train_tokens_per_second': '1694'}
+{'loss': '0.8101', 'grad_norm': '0.2232', 'learning_rate': '7.326e-07', 'epoch': '0.001587', 'num_input_tokens_seen': 26611, 'train_runtime': '15.54', 'train_tokens_per_second': '1713'}
+{'loss': '0.5654', 'grad_norm': '0.2165', 'learning_rate': '7.937e-07', 'epoch': '0.001709', 'num_input_tokens_seen': 28658, 'train_runtime': '16.57', 'train_tokens_per_second': '1730'}
+{'loss': '1.623', 'grad_norm': '0.3257', 'learning_rate': '8.547e-07', 'epoch': '0.001832', 'num_input_tokens_seen': 30705, 'train_runtime': '17.6', 'train_tokens_per_second': '1745'}
+{'loss': '1.419', 'grad_norm': '0.2852', 'learning_rate': '9.158e-07', 'epoch': '0.001954', 'num_input_tokens_seen': 32752, 'train_runtime': '18.64', 'train_tokens_per_second': '1757'}
+{'loss': '1.691', 'grad_norm': '0.3216', 'learning_rate': '9.768e-07', 'epoch': '0.002076', 'num_input_tokens_seen': 34799, 'train_runtime': '19.67', 'train_tokens_per_second': '1769'}
+{'loss': '1.732', 'grad_norm': '0.3083', 'learning_rate': '1.038e-06', 'epoch': '0.002198', 'num_input_tokens_seen': 36846, 'train_runtime': '20.71', 'train_tokens_per_second': '1780'}
+{'loss': '1.509', 'grad_norm': '0.3115', 'learning_rate': '1.099e-06', 'epoch': '0.00232', 'num_input_tokens_seen': 38893, 'train_runtime': '21.74', 'train_tokens_per_second': '1789'}
+{'loss': '1.435', 'grad_norm': '0.3078', 'learning_rate': '1.16e-06', 'epoch': '0.002442', 'num_input_tokens_seen': 40940, 'train_runtime': '22.77', 'train_tokens_per_second': '1798'}
+{'loss': '1.786', 'grad_norm': '0.4448', 'learning_rate': '1.221e-06', 'epoch': '0.002564', 'num_input_tokens_seen': 42987, 'train_runtime': '23.81', 'train_tokens_per_second': '1806'}
+{'loss': '1.102', 'grad_norm': '0.3418', 'learning_rate': '1.282e-06', 'epoch': '0.002686', 'num_input_tokens_seen': 45034, 'train_runtime': '24.84', 'train_tokens_per_second': '1813'}
+{'loss': '0.7681', 'grad_norm': '0.2483', 'learning_rate': '1.343e-06', 'epoch': '0.002808', 'num_input_tokens_seen': 47081, 'train_runtime': '25.88', 'train_tokens_per_second': '1819'}
+{'loss': '0.9436', 'grad_norm': '0.2256', 'learning_rate': '1.404e-06', 'epoch': '0.00293', 'num_input_tokens_seen': 49128, 'train_runtime': '26.91', 'train_tokens_per_second': '1825'}
+{'loss': '1.33', 'grad_norm': '0.4983', 'learning_rate': '1.465e-06', 'epoch': '0.003053', 'num_input_tokens_seen': 51175, 'train_runtime': '27.95', 'train_tokens_per_second': '1831'}
+{'loss': '1.596', 'grad_norm': '0.3407', 'learning_rate': '1.526e-06', 'epoch': '0.003175', 'num_input_tokens_seen': 53222, 'train_runtime': '28.98', 'train_tokens_per_second': '1836'}
+{'loss': '1.795', 'grad_norm': '0.3412', 'learning_rate': '1.587e-06', 'epoch': '0.003297', 'num_input_tokens_seen': 55269, 'train_runtime': '30.01', 'train_tokens_per_second': '1841'}
+{'loss': '0.7545', 'grad_norm': '0.208', 'learning_rate': '1.648e-06', 'epoch': '0.003419', 'num_input_tokens_seen': 57316, 'train_runtime': '31.05', 'train_tokens_per_second': '1846'}
+{'loss': '0.6681', 'grad_norm': '0.2179', 'learning_rate': '1.709e-06', 'epoch': '0.003541', 'num_input_tokens_seen': 59363, 'train_runtime': '32.08', 'train_tokens_per_second': '1850'}
+{'loss': '1', 'grad_norm': '0.2357', 'learning_rate': '1.77e-06', 'epoch': '0.003663', 'num_input_tokens_seen': 61410, 'train_runtime': '33.11', 'train_tokens_per_second': '1854'}
+{'loss': '1.189', 'grad_norm': '0.2451', 'learning_rate': '1.832e-06', 'epoch': '0.003785', 'num_input_tokens_seen': 63457, 'train_runtime': '34.15', 'train_tokens_per_second': '1858'}
+{'loss': '1.355', 'grad_norm': '0.2563', 'learning_rate': '1.893e-06', 'epoch': '0.003907', 'num_input_tokens_seen': 65504, 'train_runtime': '35.18', 'train_tokens_per_second': '1862'}
+{'loss': '1.41', 'grad_norm': '0.3257', 'learning_rate': '1.954e-06', 'epoch': '0.004029', 'num_input_tokens_seen': 67551, 'train_runtime': '36.22', 'train_tokens_per_second': '1865'}
+{'loss': '1.575', 'grad_norm': '0.3044', 'learning_rate': '2.015e-06', 'epoch': '0.004151', 'num_input_tokens_seen': 69598, 'train_runtime': '37.25', 'train_tokens_per_second': '1868'}
+{'loss': '1.541', 'grad_norm': '0.3241', 'learning_rate': '2.076e-06', 'epoch': '0.004274', 'num_input_tokens_seen': 71645, 'train_runtime': '38.29', 'train_tokens_per_second': '1871'}
+{'loss': '1.28', 'grad_norm': '0.3421', 'learning_rate': '2.137e-06', 'epoch': '0.004396', 'num_input_tokens_seen': 73692, 'train_runtime': '39.33', 'train_tokens_per_second': '1873'}
+{'loss': '1.935', 'grad_norm': '0.5964', 'learning_rate': '2.198e-06', 'epoch': '0.004518', 'num_input_tokens_seen': 75739, 'train_runtime': '40.37', 'train_tokens_per_second': '1876'}
+{'loss': '1.856', 'grad_norm': '0.2597', 'learning_rate': '2.259e-06', 'epoch': '0.00464', 'num_input_tokens_seen': 77786, 'train_runtime': '41.41', 'train_tokens_per_second': '1878'}
+{'loss': '0.8786', 'grad_norm': '0.2991', 'learning_rate': '2.32e-06', 'epoch': '0.004762', 'num_input_tokens_seen': 79833, 'train_runtime': '42.45', 'train_tokens_per_second': '1881'}
+{'loss': '1.273', 'grad_norm': '0.2598', 'learning_rate': '2.381e-06', 'epoch': '0.004884', 'num_input_tokens_seen': 81880, 'train_runtime': '43.49', 'train_tokens_per_second': '1883'}
+{'loss': '1.501', 'grad_norm': '0.3149', 'learning_rate': '2.442e-06', 'epoch': '0.005006', 'num_input_tokens_seen': 83927, 'train_runtime': '44.52', 'train_tokens_per_second': '1885'}
+{'loss': '1.367', 'grad_norm': '0.263', 'learning_rate': '2.503e-06', 'epoch': '0.005128', 'num_input_tokens_seen': 85974, 'train_runtime': '45.56', 'train_tokens_per_second': '1887'}
+{'loss': '0.7327', 'grad_norm': '0.226', 'learning_rate': '2.564e-06', 'epoch': '0.00525', 'num_input_tokens_seen': 88021, 'train_runtime': '46.6', 'train_tokens_per_second': '1889'}
+{'loss': '1.2', 'grad_norm': '0.2778', 'learning_rate': '2.625e-06', 'epoch': '0.005372', 'num_input_tokens_seen': 90068, 'train_runtime': '47.64', 'train_tokens_per_second': '1891'}
+{'loss': '1.659', 'grad_norm': '0.3285', 'learning_rate': '2.686e-06', 'epoch': '0.005495', 'num_input_tokens_seen': 92115, 'train_runtime': '48.68', 'train_tokens_per_second': '1892'}
+{'loss': '1.7', 'grad_norm': '0.3473', 'learning_rate': '2.747e-06', 'epoch': '0.005617', 'num_input_tokens_seen': 94162, 'train_runtime': '49.71', 'train_tokens_per_second': '1894'}
+{'loss': '1.514', 'grad_norm': '0.3474', 'learning_rate': '2.808e-06', 'epoch': '0.005739', 'num_input_tokens_seen': 96209, 'train_runtime': '50.75', 'train_tokens_per_second': '1896'}
+{'loss': '1.736', 'grad_norm': '0.3107', 'learning_rate': '2.869e-06', 'epoch': '0.005861', 'num_input_tokens_seen': 98256, 'train_runtime': '51.79', 'train_tokens_per_second': '1897'}
+{'loss': '1.359', 'grad_norm': '0.3283', 'learning_rate': '2.93e-06', 'epoch': '0.005983', 'num_input_tokens_seen': 100303, 'train_runtime': '52.84', 'train_tokens_per_second': '1898'}
+{'loss': '1.804', 'grad_norm': '0.3758', 'learning_rate': '2.991e-06', 'epoch': '0.006105', 'num_input_tokens_seen': 102350, 'train_runtime': '53.87', 'train_tokens_per_second': '1900'}
+{'loss': '1.883', 'grad_norm': '0.3822', 'learning_rate': '3.053e-06', 'epoch': '0.006227', 'num_input_tokens_seen': 104397, 'train_runtime': '54.91', 'train_tokens_per_second': '1901'}
+{'loss': '1.564', 'grad_norm': '0.3343', 'learning_rate': '3.114e-06', 'epoch': '0.006349', 'num_input_tokens_seen': 106444, 'train_runtime': '55.95', 'train_tokens_per_second': '1903'}
+{'loss': '1.817', 'grad_norm': '0.3616', 'learning_rate': '3.175e-06', 'epoch': '0.006471', 'num_input_tokens_seen': 108491, 'train_runtime': '56.99', 'train_tokens_per_second': '1904'}
+{'loss': '1.933', 'grad_norm': '0.5093', 'learning_rate': '3.236e-06', 'epoch': '0.006593', 'num_input_tokens_seen': 110538, 'train_runtime': '58.03', 'train_tokens_per_second': '1905'}
+{'loss': '1.341', 'grad_norm': '0.2829', 'learning_rate': '3.297e-06', 'epoch': '0.006716', 'num_input_tokens_seen': 112585, 'train_runtime': '59.07', 'train_tokens_per_second': '1906'}
+{'loss': '0.8502', 'grad_norm': '0.4346', 'learning_rate': '3.358e-06', 'epoch': '0.006838', 'num_input_tokens_seen': 114632, 'train_runtime': '60.11', 'train_tokens_per_second': '1907'}
+{'loss': '0.7916', 'grad_norm': '0.3189', 'learning_rate': '3.419e-06', 'epoch': '0.00696', 'num_input_tokens_seen': 116679, 'train_runtime': '61.16', 'train_tokens_per_second': '1908'}
+{'loss': '1.727', 'grad_norm': '0.33', 'learning_rate': '3.48e-06', 'epoch': '0.007082', 'num_input_tokens_seen': 118726, 'train_runtime': '62.2', 'train_tokens_per_second': '1909'}
+{'loss': '0.7382', 'grad_norm': '0.2614', 'learning_rate': '3.541e-06', 'epoch': '0.007204', 'num_input_tokens_seen': 120773, 'train_runtime': '63.24', 'train_tokens_per_second': '1910'}
+{'loss': '1.464', 'grad_norm': '0.2572', 'learning_rate': '3.602e-06', 'epoch': '0.007326', 'num_input_tokens_seen': 122820, 'train_runtime': '64.29', 'train_tokens_per_second': '1911'}
+{'loss': '1.883', 'grad_norm': '0.3837', 'learning_rate': '3.663e-06', 'epoch': '0.007448', 'num_input_tokens_seen': 124867, 'train_runtime': '65.32', 'train_tokens_per_second': '1912'}
+{'loss': '0.5972', 'grad_norm': '0.2306', 'learning_rate': '3.724e-06', 'epoch': '0.00757', 'num_input_tokens_seen': 126914, 'train_runtime': '66.36', 'train_tokens_per_second': '1912'}
+{'loss': '1.595', 'grad_norm': '0.2992', 'learning_rate': '3.785e-06', 'epoch': '0.007692', 'num_input_tokens_seen': 128961, 'train_runtime': '67.4', 'train_tokens_per_second': '1913'}
+{'loss': '1.061', 'grad_norm': '0.2566', 'learning_rate': '3.846e-06', 'epoch': '0.007814', 'num_input_tokens_seen': 131008, 'train_runtime': '68.45', 'train_tokens_per_second': '1914'}
+{'loss': '1.625', 'grad_norm': '0.327', 'learning_rate': '3.907e-06', 'epoch': '0.007937', 'num_input_tokens_seen': 133055, 'train_runtime': '69.48', 'train_tokens_per_second': '1915'}
+{'loss': '1.336', 'grad_norm': '0.3814', 'learning_rate': '3.968e-06', 'epoch': '0.008059', 'num_input_tokens_seen': 135102, 'train_runtime': '70.52', 'train_tokens_per_second': '1916'}
+{'loss': '1.048', 'grad_norm': '0.2793', 'learning_rate': '4.029e-06', 'epoch': '0.008181', 'num_input_tokens_seen': 137149, 'train_runtime': '71.56', 'train_tokens_per_second': '1916'}
+{'loss': '1.03', 'grad_norm': '0.2524', 'learning_rate': '4.09e-06', 'epoch': '0.008303', 'num_input_tokens_seen': 139196, 'train_runtime': '72.6', 'train_tokens_per_second': '1917'}
+{'loss': '1.345', 'grad_norm': '0.3772', 'learning_rate': '4.151e-06', 'epoch': '0.008425', 'num_input_tokens_seen': 141243, 'train_runtime': '73.64', 'train_tokens_per_second': '1918'}
+{'loss': '1.542', 'grad_norm': '0.3187', 'learning_rate': '4.212e-06', 'epoch': '0.008547', 'num_input_tokens_seen': 143290, 'train_runtime': '74.68', 'train_tokens_per_second': '1919'}
+{'loss': '1.629', 'grad_norm': '0.3213', 'learning_rate': '4.274e-06', 'epoch': '0.008669', 'num_input_tokens_seen': 145337, 'train_runtime': '75.72', 'train_tokens_per_second': '1919'}
+{'loss': '1.249', 'grad_norm': '0.3192', 'learning_rate': '4.335e-06', 'epoch': '0.008791', 'num_input_tokens_seen': 147384, 'train_runtime': '76.76', 'train_tokens_per_second': '1920'}
+{'loss': '1.306', 'grad_norm': '0.3547', 'learning_rate': '4.396e-06', 'epoch': '0.008913', 'num_input_tokens_seen': 149431, 'train_runtime': '77.8', 'train_tokens_per_second': '1921'}
+{'loss': '0.7812', 'grad_norm': '0.2844', 'learning_rate': '4.457e-06', 'epoch': '0.009035', 'num_input_tokens_seen': 151478, 'train_runtime': '78.84', 'train_tokens_per_second': '1921'}
+{'loss': '1.514', 'grad_norm': '0.3885', 'learning_rate': '4.518e-06', 'epoch': '0.009158', 'num_input_tokens_seen': 153525, 'train_runtime': '79.88', 'train_tokens_per_second': '1922'}
+{'loss': '0.8488', 'grad_norm': '0.3948', 'learning_rate': '4.579e-06', 'epoch': '0.00928', 'num_input_tokens_seen': 155572, 'train_runtime': '80.92', 'train_tokens_per_second': '1922'}
+{'loss': '1.743', 'grad_norm': '0.4191', 'learning_rate': '4.64e-06', 'epoch': '0.009402', 'num_input_tokens_seen': 157619, 'train_runtime': '81.96', 'train_tokens_per_second': '1923'}
+{'loss': '1.392', 'grad_norm': '0.3332', 'learning_rate': '4.701e-06', 'epoch': '0.009524', 'num_input_tokens_seen': 159666, 'train_runtime': '83', 'train_tokens_per_second': '1924'}
+{'loss': '1.175', 'grad_norm': '0.3005', 'learning_rate': '4.762e-06', 'epoch': '0.009646', 'num_input_tokens_seen': 161713, 'train_runtime': '84.04', 'train_tokens_per_second': '1924'}
+{'loss': '0.7409', 'grad_norm': '0.2708', 'learning_rate': '4.823e-06', 'epoch': '0.009768', 'num_input_tokens_seen': 163760, 'train_runtime': '85.08', 'train_tokens_per_second': '1925'}
+{'loss': '1.576', 'grad_norm': '0.345', 'learning_rate': '4.884e-06', 'epoch': '0.00989', 'num_input_tokens_seen': 165807, 'train_runtime': '86.12', 'train_tokens_per_second': '1925'}
+{'loss': '1.075', 'grad_norm': '0.3355', 'learning_rate': '4.945e-06', 'epoch': '0.01001', 'num_input_tokens_seen': 167854, 'train_runtime': '87.16', 'train_tokens_per_second': '1926'}
+{'loss': '1.578', 'grad_norm': '0.3503', 'learning_rate': '5.006e-06', 'epoch': '0.01013', 'num_input_tokens_seen': 169901, 'train_runtime': '88.2', 'train_tokens_per_second': '1926'}
+{'loss': '0.7839', 'grad_norm': '0.3156', 'learning_rate': '5.067e-06', 'epoch': '0.01026', 'num_input_tokens_seen': 171948, 'train_runtime': '89.24', 'train_tokens_per_second': '1927'}
+{'loss': '1.158', 'grad_norm': '0.2784', 'learning_rate': '5.128e-06', 'epoch': '0.01038', 'num_input_tokens_seen': 173995, 'train_runtime': '90.28', 'train_tokens_per_second': '1927'}
+{'loss': '0.9069', 'grad_norm': '0.2335', 'learning_rate': '5.189e-06', 'epoch': '0.0105', 'num_input_tokens_seen': 176042, 'train_runtime': '91.32', 'train_tokens_per_second': '1928'}
+{'loss': '0.7521', 'grad_norm': '0.2156', 'learning_rate': '5.25e-06', 'epoch': '0.01062', 'num_input_tokens_seen': 178089, 'train_runtime': '92.36', 'train_tokens_per_second': '1928'}
+{'loss': '0.8007', 'grad_norm': '0.403', 'learning_rate': '5.311e-06', 'epoch': '0.01074', 'num_input_tokens_seen': 180136, 'train_runtime': '93.4', 'train_tokens_per_second': '1929'}
+{'loss': '1.275', 'grad_norm': '0.3252', 'learning_rate': '5.372e-06', 'epoch': '0.01087', 'num_input_tokens_seen': 182183, 'train_runtime': '94.44', 'train_tokens_per_second': '1929'}
+{'loss': '0.6342', 'grad_norm': '0.2193', 'learning_rate': '5.433e-06', 'epoch': '0.01099', 'num_input_tokens_seen': 184230, 'train_runtime': '95.48', 'train_tokens_per_second': '1930'}
+{'loss': '0.6681', 'grad_norm': '0.2239', 'learning_rate': '5.495e-06', 'epoch': '0.01111', 'num_input_tokens_seen': 186277, 'train_runtime': '96.52', 'train_tokens_per_second': '1930'}
+{'loss': '1.821', 'grad_norm': '0.3542', 'learning_rate': '5.556e-06', 'epoch': '0.01123', 'num_input_tokens_seen': 188324, 'train_runtime': '97.56', 'train_tokens_per_second': '1930'}
+{'loss': '1.281', 'grad_norm': '0.4459', 'learning_rate': '5.617e-06', 'epoch': '0.01136', 'num_input_tokens_seen': 190371, 'train_runtime': '98.6', 'train_tokens_per_second': '1931'}
+{'loss': '0.6506', 'grad_norm': '0.2389', 'learning_rate': '5.678e-06', 'epoch': '0.01148', 'num_input_tokens_seen': 192418, 'train_runtime': '99.64', 'train_tokens_per_second': '1931'}
+{'loss': '0.6121', 'grad_norm': '0.2921', 'learning_rate': '5.739e-06', 'epoch': '0.0116', 'num_input_tokens_seen': 194465, 'train_runtime': '100.7', 'train_tokens_per_second': '1932'}
+{'loss': '1.242', 'grad_norm': '0.3328', 'learning_rate': '5.8e-06', 'epoch': '0.01172', 'num_input_tokens_seen': 196512, 'train_runtime': '101.7', 'train_tokens_per_second': '1932'}
+{'loss': '1.336', 'grad_norm': '0.3499', 'learning_rate': '5.861e-06', 'epoch': '0.01184', 'num_input_tokens_seen': 198559, 'train_runtime': '102.8', 'train_tokens_per_second': '1932'}
+{'loss': '1.114', 'grad_norm': '0.2844', 'learning_rate': '5.922e-06', 'epoch': '0.01197', 'num_input_tokens_seen': 200606, 'train_runtime': '103.8', 'train_tokens_per_second': '1933'}
+{'loss': '1.557', 'grad_norm': '0.406', 'learning_rate': '5.983e-06', 'epoch': '0.01209', 'num_input_tokens_seen': 202653, 'train_runtime': '104.8', 'train_tokens_per_second': '1933'}
+{'loss': '1.696', 'grad_norm': '0.424', 'learning_rate': '6.044e-06', 'epoch': '0.01221', 'num_input_tokens_seen': 204700, 'train_runtime': '105.9', 'train_tokens_per_second': '1933'}
+{'loss': '0.8087', 'grad_norm': '0.2645', 'learning_rate': '6.105e-06', 'epoch': '0.01233', 'num_input_tokens_seen': 206747, 'train_runtime': '106.9', 'train_tokens_per_second': '1934'}
+{'loss': '1.774', 'grad_norm': '0.3712', 'learning_rate': '6.166e-06', 'epoch': '0.01245', 'num_input_tokens_seen': 208794, 'train_runtime': '108', 'train_tokens_per_second': '1934'}
+{'loss': '1.606', 'grad_norm': '0.3108', 'learning_rate': '6.227e-06', 'epoch': '0.01258', 'num_input_tokens_seen': 210841, 'train_runtime': '109', 'train_tokens_per_second': '1934'}
+{'loss': '1.639', 'grad_norm': '0.367', 'learning_rate': '6.288e-06', 'epoch': '0.0127', 'num_input_tokens_seen': 212888, 'train_runtime': '110', 'train_tokens_per_second': '1935'}
+{'loss': '1.371', 'grad_norm': '0.4389', 'learning_rate': '6.349e-06', 'epoch': '0.01282', 'num_input_tokens_seen': 214935, 'train_runtime': '111.1', 'train_tokens_per_second': '1935'}
+{'loss': '1.387', 'grad_norm': '0.3083', 'learning_rate': '6.41e-06', 'epoch': '0.01294', 'num_input_tokens_seen': 216982, 'train_runtime': '112.1', 'train_tokens_per_second': '1935'}
+{'loss': '1.196', 'grad_norm': '0.3401', 'learning_rate': '6.471e-06', 'epoch': '0.01306', 'num_input_tokens_seen': 219029, 'train_runtime': '113.2', 'train_tokens_per_second': '1936'}
+{'loss': '1.118', 'grad_norm': '0.3518', 'learning_rate': '6.532e-06', 'epoch': '0.01319', 'num_input_tokens_seen': 221076, 'train_runtime': '114.2', 'train_tokens_per_second': '1936'}
+{'loss': '1.773', 'grad_norm': '0.3935', 'learning_rate': '6.593e-06', 'epoch': '0.01331', 'num_input_tokens_seen': 223123, 'train_runtime': '115.2', 'train_tokens_per_second': '1936'}
+{'loss': '0.9553', 'grad_norm': '0.287', 'learning_rate': '6.654e-06', 'epoch': '0.01343', 'num_input_tokens_seen': 225170, 'train_runtime': '116.3', 'train_tokens_per_second': '1936'}
+{'loss': '1.564', 'grad_norm': '0.3739', 'learning_rate': '6.716e-06', 'epoch': '0.01355', 'num_input_tokens_seen': 227217, 'train_runtime': '117.3', 'train_tokens_per_second': '1937'}
+{'loss': '1.568', 'grad_norm': '0.3468', 'learning_rate': '6.777e-06', 'epoch': '0.01368', 'num_input_tokens_seen': 229264, 'train_runtime': '118.4', 'train_tokens_per_second': '1937'}
+{'loss': '0.706', 'grad_norm': '0.2301', 'learning_rate': '6.838e-06', 'epoch': '0.0138', 'num_input_tokens_seen': 231311, 'train_runtime': '119.4', 'train_tokens_per_second': '1937'}
+{'loss': '1.193', 'grad_norm': '0.35', 'learning_rate': '6.899e-06', 'epoch': '0.01392', 'num_input_tokens_seen': 233358, 'train_runtime': '120.4', 'train_tokens_per_second': '1937'}
+{'loss': '0.7765', 'grad_norm': '0.2343', 'learning_rate': '6.96e-06', 'epoch': '0.01404', 'num_input_tokens_seen': 235405, 'train_runtime': '121.5', 'train_tokens_per_second': '1938'}
+{'loss': '1.459', 'grad_norm': '0.3429', 'learning_rate': '7.021e-06', 'epoch': '0.01416', 'num_input_tokens_seen': 237452, 'train_runtime': '122.5', 'train_tokens_per_second': '1938'}
+{'loss': '0.6124', 'grad_norm': '0.2381', 'learning_rate': '7.082e-06', 'epoch': '0.01429', 'num_input_tokens_seen': 239499, 'train_runtime': '123.6', 'train_tokens_per_second': '1938'}
+{'loss': '1.599', 'grad_norm': '0.2987', 'learning_rate': '7.143e-06', 'epoch': '0.01441', 'num_input_tokens_seen': 241546, 'train_runtime': '124.6', 'train_tokens_per_second': '1938'}
+{'loss': '1.772', 'grad_norm': '0.3923', 'learning_rate': '7.204e-06', 'epoch': '0.01453', 'num_input_tokens_seen': 243593, 'train_runtime': '125.7', 'train_tokens_per_second': '1939'}
+{'loss': '1.539', 'grad_norm': '0.3114', 'learning_rate': '7.265e-06', 'epoch': '0.01465', 'num_input_tokens_seen': 245640, 'train_runtime': '126.7', 'train_tokens_per_second': '1939'}
+{'loss': '0.7979', 'grad_norm': '0.2755', 'learning_rate': '7.326e-06', 'epoch': '0.01477', 'num_input_tokens_seen': 247687, 'train_runtime': '127.8', 'train_tokens_per_second': '1939'}
+{'loss': '1.567', 'grad_norm': '0.3734', 'learning_rate': '7.387e-06', 'epoch': '0.0149', 'num_input_tokens_seen': 249734, 'train_runtime': '128.8', 'train_tokens_per_second': '1939'}
+{'loss': '1.784', 'grad_norm': '0.3785', 'learning_rate': '7.448e-06', 'epoch': '0.01502', 'num_input_tokens_seen': 251781, 'train_runtime': '129.9', 'train_tokens_per_second': '1939'}
+{'loss': '0.7357', 'grad_norm': '0.253', 'learning_rate': '7.509e-06', 'epoch': '0.01514', 'num_input_tokens_seen': 253828, 'train_runtime': '130.9', 'train_tokens_per_second': '1939'}
+{'loss': '1.653', 'grad_norm': '0.377', 'learning_rate': '7.57e-06', 'epoch': '0.01526', 'num_input_tokens_seen': 255875, 'train_runtime': '131.9', 'train_tokens_per_second': '1939'}
+{'loss': '1.618', 'grad_norm': '0.403', 'learning_rate': '7.631e-06', 'epoch': '0.01538', 'num_input_tokens_seen': 257922, 'train_runtime': '133', 'train_tokens_per_second': '1939'}
+{'loss': '1.129', 'grad_norm': '0.3196', 'learning_rate': '7.692e-06', 'epoch': '0.01551', 'num_input_tokens_seen': 259969, 'train_runtime': '134', 'train_tokens_per_second': '1940'}
+{'loss': '1.338', 'grad_norm': '0.3365', 'learning_rate': '7.753e-06', 'epoch': '0.01563', 'num_input_tokens_seen': 262016, 'train_runtime': '135.1', 'train_tokens_per_second': '1940'}
+{'loss': '0.765', 'grad_norm': '0.2496', 'learning_rate': '7.814e-06', 'epoch': '0.01575', 'num_input_tokens_seen': 264063, 'train_runtime': '136.1', 'train_tokens_per_second': '1940'}
+{'loss': '1.374', 'grad_norm': '0.3893', 'learning_rate': '7.875e-06', 'epoch': '0.01587', 'num_input_tokens_seen': 266110, 'train_runtime': '137.2', 'train_tokens_per_second': '1940'}
+{'loss': '0.5615', 'grad_norm': '0.2409', 'learning_rate': '7.937e-06', 'epoch': '0.016', 'num_input_tokens_seen': 268157, 'train_runtime': '138.2', 'train_tokens_per_second': '1940'}
+{'loss': '1.336', 'grad_norm': '0.4831', 'learning_rate': '7.998e-06', 'epoch': '0.01612', 'num_input_tokens_seen': 270204, 'train_runtime': '139.3', 'train_tokens_per_second': '1940'}
+{'loss': '1.064', 'grad_norm': '0.2854', 'learning_rate': '8.059e-06', 'epoch': '0.01624', 'num_input_tokens_seen': 272251, 'train_runtime': '140.3', 'train_tokens_per_second': '1941'}
+{'loss': '1.798', 'grad_norm': '0.4435', 'learning_rate': '8.12e-06', 'epoch': '0.01636', 'num_input_tokens_seen': 274298, 'train_runtime': '141.3', 'train_tokens_per_second': '1941'}
+{'loss': '1.663', 'grad_norm': '0.3327', 'learning_rate': '8.181e-06', 'epoch': '0.01648', 'num_input_tokens_seen': 276345, 'train_runtime': '142.4', 'train_tokens_per_second': '1941'}
+{'loss': '1.236', 'grad_norm': '0.2907', 'learning_rate': '8.242e-06', 'epoch': '0.01661', 'num_input_tokens_seen': 278392, 'train_runtime': '143.4', 'train_tokens_per_second': '1941'}
+{'loss': '1.616', 'grad_norm': '0.3805', 'learning_rate': '8.303e-06', 'epoch': '0.01673', 'num_input_tokens_seen': 280439, 'train_runtime': '144.5', 'train_tokens_per_second': '1941'}
+{'loss': '1.091', 'grad_norm': '0.3047', 'learning_rate': '8.364e-06', 'epoch': '0.01685', 'num_input_tokens_seen': 282486, 'train_runtime': '145.5', 'train_tokens_per_second': '1941'}
+{'loss': '1.119', 'grad_norm': '0.3349', 'learning_rate': '8.425e-06', 'epoch': '0.01697', 'num_input_tokens_seen': 284533, 'train_runtime': '146.5', 'train_tokens_per_second': '1942'}
+{'loss': '1.409', 'grad_norm': '0.3579', 'learning_rate': '8.486e-06', 'epoch': '0.01709', 'num_input_tokens_seen': 286580, 'train_runtime': '147.6', 'train_tokens_per_second': '1942'}
+{'loss': '1.174', 'grad_norm': '0.3325', 'learning_rate': '8.547e-06', 'epoch': '0.01722', 'num_input_tokens_seen': 288627, 'train_runtime': '148.6', 'train_tokens_per_second': '1942'}
+{'loss': '1.619', 'grad_norm': '0.3922', 'learning_rate': '8.608e-06', 'epoch': '0.01734', 'num_input_tokens_seen': 290674, 'train_runtime': '149.7', 'train_tokens_per_second': '1942'}
+{'loss': '1.246', 'grad_norm': '0.3678', 'learning_rate': '8.669e-06', 'epoch': '0.01746', 'num_input_tokens_seen': 292721, 'train_runtime': '150.7', 'train_tokens_per_second': '1942'}
+{'loss': '1.872', 'grad_norm': '0.4983', 'learning_rate': '8.73e-06', 'epoch': '0.01758', 'num_input_tokens_seen': 294768, 'train_runtime': '151.7', 'train_tokens_per_second': '1943'}
+{'loss': '0.6529', 'grad_norm': '0.2682', 'learning_rate': '8.791e-06', 'epoch': '0.0177', 'num_input_tokens_seen': 296815, 'train_runtime': '152.8', 'train_tokens_per_second': '1943'}
+{'loss': '1.417', 'grad_norm': '0.4103', 'learning_rate': '8.852e-06', 'epoch': '0.01783', 'num_input_tokens_seen': 298862, 'train_runtime': '153.8', 'train_tokens_per_second': '1943'}
+{'loss': '1.427', 'grad_norm': '0.3734', 'learning_rate': '8.913e-06', 'epoch': '0.01795', 'num_input_tokens_seen': 300909, 'train_runtime': '154.9', 'train_tokens_per_second': '1943'}
+{'loss': '1.003', 'grad_norm': '0.3332', 'learning_rate': '8.974e-06', 'epoch': '0.01807', 'num_input_tokens_seen': 302956, 'train_runtime': '155.9', 'train_tokens_per_second': '1943'}
+{'loss': '1.532', 'grad_norm': '0.4219', 'learning_rate': '9.035e-06', 'epoch': '0.01819', 'num_input_tokens_seen': 305003, 'train_runtime': '157', 'train_tokens_per_second': '1943'}
+{'loss': '1.637', 'grad_norm': '0.4259', 'learning_rate': '9.096e-06', 'epoch': '0.01832', 'num_input_tokens_seen': 307050, 'train_runtime': '158', 'train_tokens_per_second': '1943'}
+{'loss': '1.504', 'grad_norm': '0.358', 'learning_rate': '9.158e-06', 'epoch': '0.01844', 'num_input_tokens_seen': 309097, 'train_runtime': '159', 'train_tokens_per_second': '1943'}
+{'loss': '1.709', 'grad_norm': '0.3905', 'learning_rate': '9.219e-06', 'epoch': '0.01856', 'num_input_tokens_seen': 311144, 'train_runtime': '160.1', 'train_tokens_per_second': '1944'}
+{'loss': '1.369', 'grad_norm': '0.3816', 'learning_rate': '9.28e-06', 'epoch': '0.01868', 'num_input_tokens_seen': 313191, 'train_runtime': '161.1', 'train_tokens_per_second': '1944'}
+{'loss': '1.261', 'grad_norm': '0.3293', 'learning_rate': '9.341e-06', 'epoch': '0.0188', 'num_input_tokens_seen': 315238, 'train_runtime': '162.2', 'train_tokens_per_second': '1944'}
+{'loss': '1.141', 'grad_norm': '0.2806', 'learning_rate': '9.402e-06', 'epoch': '0.01893', 'num_input_tokens_seen': 317285, 'train_runtime': '163.2', 'train_tokens_per_second': '1944'}
+{'loss': '1.054', 'grad_norm': '0.3715', 'learning_rate': '9.463e-06', 'epoch': '0.01905', 'num_input_tokens_seen': 319332, 'train_runtime': '164.3', 'train_tokens_per_second': '1944'}
+{'loss': '1.37', 'grad_norm': '0.3663', 'learning_rate': '9.524e-06', 'epoch': '0.01917', 'num_input_tokens_seen': 321379, 'train_runtime': '165.3', 'train_tokens_per_second': '1944'}
+{'loss': '1.426', 'grad_norm': '0.5083', 'learning_rate': '9.585e-06', 'epoch': '0.01929', 'num_input_tokens_seen': 323426, 'train_runtime': '166.4', 'train_tokens_per_second': '1944'}
+{'loss': '1.088', 'grad_norm': '0.3836', 'learning_rate': '9.646e-06', 'epoch': '0.01941', 'num_input_tokens_seen': 325473, 'train_runtime': '167.4', 'train_tokens_per_second': '1944'}
+{'loss': '1.333', 'grad_norm': '0.4061', 'learning_rate': '9.707e-06', 'epoch': '0.01954', 'num_input_tokens_seen': 327520, 'train_runtime': '168.5', 'train_tokens_per_second': '1944'}
+{'loss': '1.822', 'grad_norm': '0.4357', 'learning_rate': '9.768e-06', 'epoch': '0.01966', 'num_input_tokens_seen': 329567, 'train_runtime': '169.5', 'train_tokens_per_second': '1944'}
+{'loss': '1.696', 'grad_norm': '0.5028', 'learning_rate': '9.829e-06', 'epoch': '0.01978', 'num_input_tokens_seen': 331614, 'train_runtime': '170.5', 'train_tokens_per_second': '1944'}
+{'loss': '1.377', 'grad_norm': '0.3383', 'learning_rate': '9.89e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 333661, 'train_runtime': '171.6', 'train_tokens_per_second': '1945'}
+{'loss': '1.33', 'grad_norm': '0.3506', 'learning_rate': '9.951e-06', 'epoch': '0.02002', 'num_input_tokens_seen': 335708, 'train_runtime': '172.6', 'train_tokens_per_second': '1945'}
+{'loss': '1.319', 'grad_norm': '0.3446', 'learning_rate': '1.001e-05', 'epoch': '0.02015', 'num_input_tokens_seen': 337755, 'train_runtime': '173.7', 'train_tokens_per_second': '1945'}
+{'loss': '0.7777', 'grad_norm': '0.329', 'learning_rate': '1.007e-05', 'epoch': '0.02027', 'num_input_tokens_seen': 339802, 'train_runtime': '174.7', 'train_tokens_per_second': '1945'}
+{'loss': '1.454', 'grad_norm': '0.3612', 'learning_rate': '1.013e-05', 'epoch': '0.02039', 'num_input_tokens_seen': 341849, 'train_runtime': '175.8', 'train_tokens_per_second': '1945'}
+{'loss': '1.898', 'grad_norm': '0.533', 'learning_rate': '1.02e-05', 'epoch': '0.02051', 'num_input_tokens_seen': 343896, 'train_runtime': '176.8', 'train_tokens_per_second': '1945'}
+{'loss': '2.036', 'grad_norm': '0.505', 'learning_rate': '1.026e-05', 'epoch': '0.02063', 'num_input_tokens_seen': 345943, 'train_runtime': '177.8', 'train_tokens_per_second': '1945'}
+{'loss': '1.385', 'grad_norm': '0.3653', 'learning_rate': '1.032e-05', 'epoch': '0.02076', 'num_input_tokens_seen': 347990, 'train_runtime': '178.9', 'train_tokens_per_second': '1945'}
+{'loss': '1.659', 'grad_norm': '0.4241', 'learning_rate': '1.038e-05', 'epoch': '0.02088', 'num_input_tokens_seen': 350037, 'train_runtime': '179.9', 'train_tokens_per_second': '1945'}
+{'loss': '1.679', 'grad_norm': '0.4856', 'learning_rate': '1.044e-05', 'epoch': '0.021', 'num_input_tokens_seen': 352084, 'train_runtime': '181', 'train_tokens_per_second': '1945'}
+{'loss': '1.683', 'grad_norm': '0.5514', 'learning_rate': '1.05e-05', 'epoch': '0.02112', 'num_input_tokens_seen': 354131, 'train_runtime': '182', 'train_tokens_per_second': '1945'}
+{'loss': '1.14', 'grad_norm': '0.3525', 'learning_rate': '1.056e-05', 'epoch': '0.02125', 'num_input_tokens_seen': 356178, 'train_runtime': '183.1', 'train_tokens_per_second': '1946'}
+{'loss': '1.289', 'grad_norm': '0.411', 'learning_rate': '1.062e-05', 'epoch': '0.02137', 'num_input_tokens_seen': 358225, 'train_runtime': '184.1', 'train_tokens_per_second': '1946'}
+{'loss': '1.292', 'grad_norm': '0.3842', 'learning_rate': '1.068e-05', 'epoch': '0.02149', 'num_input_tokens_seen': 360272, 'train_runtime': '185.2', 'train_tokens_per_second': '1946'}
+{'loss': '1.556', 'grad_norm': '0.4754', 'learning_rate': '1.074e-05', 'epoch': '0.02161', 'num_input_tokens_seen': 362319, 'train_runtime': '186.2', 'train_tokens_per_second': '1946'}
+{'loss': '1.217', 'grad_norm': '0.4027', 'learning_rate': '1.081e-05', 'epoch': '0.02173', 'num_input_tokens_seen': 364366, 'train_runtime': '187.3', 'train_tokens_per_second': '1946'}
+{'loss': '1.299', 'grad_norm': '0.3883', 'learning_rate': '1.087e-05', 'epoch': '0.02186', 'num_input_tokens_seen': 366413, 'train_runtime': '188.3', 'train_tokens_per_second': '1946'}
+{'loss': '1.037', 'grad_norm': '0.3734', 'learning_rate': '1.093e-05', 'epoch': '0.02198', 'num_input_tokens_seen': 368460, 'train_runtime': '189.4', 'train_tokens_per_second': '1946'}
+{'loss': '0.6348', 'grad_norm': '0.2862', 'learning_rate': '1.099e-05', 'epoch': '0.0221', 'num_input_tokens_seen': 370507, 'train_runtime': '190.4', 'train_tokens_per_second': '1946'}
+{'loss': '0.6528', 'grad_norm': '0.3172', 'learning_rate': '1.105e-05', 'epoch': '0.02222', 'num_input_tokens_seen': 372554, 'train_runtime': '191.4', 'train_tokens_per_second': '1946'}
+{'loss': '1.187', 'grad_norm': '0.3287', 'learning_rate': '1.111e-05', 'epoch': '0.02234', 'num_input_tokens_seen': 374601, 'train_runtime': '192.5', 'train_tokens_per_second': '1946'}
+{'loss': '1.374', 'grad_norm': '0.4477', 'learning_rate': '1.117e-05', 'epoch': '0.02247', 'num_input_tokens_seen': 376648, 'train_runtime': '193.5', 'train_tokens_per_second': '1946'}
+{'loss': '0.8842', 'grad_norm': '0.3564', 'learning_rate': '1.123e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 378695, 'train_runtime': '194.6', 'train_tokens_per_second': '1946'}
+{'loss': '1.746', 'grad_norm': '0.5357', 'learning_rate': '1.129e-05', 'epoch': '0.02271', 'num_input_tokens_seen': 380742, 'train_runtime': '195.6', 'train_tokens_per_second': '1946'}
+{'loss': '1.602', 'grad_norm': '0.4641', 'learning_rate': '1.136e-05', 'epoch': '0.02283', 'num_input_tokens_seen': 382789, 'train_runtime': '196.7', 'train_tokens_per_second': '1946'}
+{'loss': '1.474', 'grad_norm': '0.4725', 'learning_rate': '1.142e-05', 'epoch': '0.02295', 'num_input_tokens_seen': 384836, 'train_runtime': '197.7', 'train_tokens_per_second': '1947'}
+{'loss': '1.636', 'grad_norm': '0.4804', 'learning_rate': '1.148e-05', 'epoch': '0.02308', 'num_input_tokens_seen': 386883, 'train_runtime': '198.7', 'train_tokens_per_second': '1947'}
+{'loss': '0.817', 'grad_norm': '0.3419', 'learning_rate': '1.154e-05', 'epoch': '0.0232', 'num_input_tokens_seen': 388930, 'train_runtime': '199.8', 'train_tokens_per_second': '1947'}
+{'loss': '1.302', 'grad_norm': '0.4552', 'learning_rate': '1.16e-05', 'epoch': '0.02332', 'num_input_tokens_seen': 390977, 'train_runtime': '200.8', 'train_tokens_per_second': '1947'}
+{'loss': '0.6107', 'grad_norm': '0.2889', 'learning_rate': '1.166e-05', 'epoch': '0.02344', 'num_input_tokens_seen': 393024, 'train_runtime': '201.9', 'train_tokens_per_second': '1947'}
+{'loss': '0.8986', 'grad_norm': '0.3478', 'learning_rate': '1.172e-05', 'epoch': '0.02357', 'num_input_tokens_seen': 395071, 'train_runtime': '202.9', 'train_tokens_per_second': '1947'}
+{'loss': '1.096', 'grad_norm': '0.4148', 'learning_rate': '1.178e-05', 'epoch': '0.02369', 'num_input_tokens_seen': 397118, 'train_runtime': '204', 'train_tokens_per_second': '1947'}
+{'loss': '1.41', 'grad_norm': '0.4401', 'learning_rate': '1.184e-05', 'epoch': '0.02381', 'num_input_tokens_seen': 399165, 'train_runtime': '205', 'train_tokens_per_second': '1947'}
+{'loss': '1.205', 'grad_norm': '0.4643', 'learning_rate': '1.19e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 401212, 'train_runtime': '206', 'train_tokens_per_second': '1947'}
+{'loss': '1.366', 'grad_norm': '0.4238', 'learning_rate': '1.197e-05', 'epoch': '0.02405', 'num_input_tokens_seen': 403259, 'train_runtime': '207.1', 'train_tokens_per_second': '1947'}
+{'loss': '0.993', 'grad_norm': '0.3481', 'learning_rate': '1.203e-05', 'epoch': '0.02418', 'num_input_tokens_seen': 405306, 'train_runtime': '208.1', 'train_tokens_per_second': '1947'}
+{'loss': '1.445', 'grad_norm': '0.4837', 'learning_rate': '1.209e-05', 'epoch': '0.0243', 'num_input_tokens_seen': 407353, 'train_runtime': '209.2', 'train_tokens_per_second': '1947'}
+{'loss': '1.562', 'grad_norm': '0.4731', 'learning_rate': '1.215e-05', 'epoch': '0.02442', 'num_input_tokens_seen': 409400, 'train_runtime': '210.2', 'train_tokens_per_second': '1947'}
+{'loss': '1.273', 'grad_norm': '0.4386', 'learning_rate': '1.221e-05', 'epoch': '0.02454', 'num_input_tokens_seen': 411447, 'train_runtime': '211.3', 'train_tokens_per_second': '1948'}
+{'loss': '1.156', 'grad_norm': '0.4585', 'learning_rate': '1.227e-05', 'epoch': '0.02466', 'num_input_tokens_seen': 413494, 'train_runtime': '212.3', 'train_tokens_per_second': '1948'}
+{'loss': '1.283', 'grad_norm': '0.5704', 'learning_rate': '1.233e-05', 'epoch': '0.02479', 'num_input_tokens_seen': 415541, 'train_runtime': '213.3', 'train_tokens_per_second': '1948'}
+{'loss': '1.569', 'grad_norm': '0.518', 'learning_rate': '1.239e-05', 'epoch': '0.02491', 'num_input_tokens_seen': 417588, 'train_runtime': '214.4', 'train_tokens_per_second': '1948'}
+{'loss': '0.7729', 'grad_norm': '0.3459', 'learning_rate': '1.245e-05', 'epoch': '0.02503', 'num_input_tokens_seen': 419635, 'train_runtime': '215.4', 'train_tokens_per_second': '1948'}
+{'loss': '1.291', 'grad_norm': '0.7312', 'learning_rate': '1.252e-05', 'epoch': '0.02515', 'num_input_tokens_seen': 421682, 'train_runtime': '216.5', 'train_tokens_per_second': '1948'}
+{'loss': '1.418', 'grad_norm': '0.4646', 'learning_rate': '1.258e-05', 'epoch': '0.02527', 'num_input_tokens_seen': 423729, 'train_runtime': '217.5', 'train_tokens_per_second': '1948'}
+{'loss': '1.08', 'grad_norm': '0.3567', 'learning_rate': '1.264e-05', 'epoch': '0.0254', 'num_input_tokens_seen': 425776, 'train_runtime': '218.6', 'train_tokens_per_second': '1948'}
+{'loss': '1.676', 'grad_norm': '0.4932', 'learning_rate': '1.27e-05', 'epoch': '0.02552', 'num_input_tokens_seen': 427823, 'train_runtime': '219.6', 'train_tokens_per_second': '1948'}
+{'loss': '1.681', 'grad_norm': '0.583', 'learning_rate': '1.276e-05', 'epoch': '0.02564', 'num_input_tokens_seen': 429870, 'train_runtime': '220.6', 'train_tokens_per_second': '1948'}
+{'loss': '1.45', 'grad_norm': '0.4864', 'learning_rate': '1.282e-05', 'epoch': '0.02576', 'num_input_tokens_seen': 431917, 'train_runtime': '221.7', 'train_tokens_per_second': '1948'}
+{'loss': '0.9681', 'grad_norm': '0.4069', 'learning_rate': '1.288e-05', 'epoch': '0.02589', 'num_input_tokens_seen': 433964, 'train_runtime': '222.7', 'train_tokens_per_second': '1948'}
+{'loss': '0.625', 'grad_norm': '0.3162', 'learning_rate': '1.294e-05', 'epoch': '0.02601', 'num_input_tokens_seen': 436011, 'train_runtime': '223.8', 'train_tokens_per_second': '1948'}
+{'loss': '0.6339', 'grad_norm': '0.311', 'learning_rate': '1.3e-05', 'epoch': '0.02613', 'num_input_tokens_seen': 438058, 'train_runtime': '224.8', 'train_tokens_per_second': '1948'}
+{'loss': '1.575', 'grad_norm': '0.6165', 'learning_rate': '1.306e-05', 'epoch': '0.02625', 'num_input_tokens_seen': 440105, 'train_runtime': '225.9', 'train_tokens_per_second': '1949'}
+{'loss': '1.584', 'grad_norm': '0.5149', 'learning_rate': '1.313e-05', 'epoch': '0.02637', 'num_input_tokens_seen': 442152, 'train_runtime': '226.9', 'train_tokens_per_second': '1949'}
+{'loss': '1.182', 'grad_norm': '0.4159', 'learning_rate': '1.319e-05', 'epoch': '0.0265', 'num_input_tokens_seen': 444199, 'train_runtime': '227.9', 'train_tokens_per_second': '1949'}
+{'loss': '1.79', 'grad_norm': '0.7241', 'learning_rate': '1.325e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 446246, 'train_runtime': '229', 'train_tokens_per_second': '1949'}
+{'loss': '0.6397', 'grad_norm': '0.333', 'learning_rate': '1.331e-05', 'epoch': '0.02674', 'num_input_tokens_seen': 448293, 'train_runtime': '230', 'train_tokens_per_second': '1949'}
+{'loss': '1.596', 'grad_norm': '0.5754', 'learning_rate': '1.337e-05', 'epoch': '0.02686', 'num_input_tokens_seen': 450340, 'train_runtime': '231.1', 'train_tokens_per_second': '1949'}
+{'loss': '0.8092', 'grad_norm': '0.3567', 'learning_rate': '1.343e-05', 'epoch': '0.02698', 'num_input_tokens_seen': 452387, 'train_runtime': '232.1', 'train_tokens_per_second': '1949'}
+{'loss': '1.481', 'grad_norm': '0.5373', 'learning_rate': '1.349e-05', 'epoch': '0.02711', 'num_input_tokens_seen': 454434, 'train_runtime': '233.2', 'train_tokens_per_second': '1949'}
+{'loss': '1.147', 'grad_norm': '0.4188', 'learning_rate': '1.355e-05', 'epoch': '0.02723', 'num_input_tokens_seen': 456481, 'train_runtime': '234.2', 'train_tokens_per_second': '1949'}
+{'loss': '1.602', 'grad_norm': '0.5181', 'learning_rate': '1.361e-05', 'epoch': '0.02735', 'num_input_tokens_seen': 458528, 'train_runtime': '235.2', 'train_tokens_per_second': '1949'}
+{'loss': '1.538', 'grad_norm': '0.5708', 'learning_rate': '1.368e-05', 'epoch': '0.02747', 'num_input_tokens_seen': 460575, 'train_runtime': '236.3', 'train_tokens_per_second': '1949'}
+{'loss': '0.7203', 'grad_norm': '0.3572', 'learning_rate': '1.374e-05', 'epoch': '0.02759', 'num_input_tokens_seen': 462622, 'train_runtime': '237.3', 'train_tokens_per_second': '1949'}
+{'loss': '1.721', 'grad_norm': '0.5984', 'learning_rate': '1.38e-05', 'epoch': '0.02772', 'num_input_tokens_seen': 464669, 'train_runtime': '238.4', 'train_tokens_per_second': '1949'}
+{'loss': '1.278', 'grad_norm': '0.5072', 'learning_rate': '1.386e-05', 'epoch': '0.02784', 'num_input_tokens_seen': 466716, 'train_runtime': '239.4', 'train_tokens_per_second': '1949'}
+{'loss': '1.659', 'grad_norm': '0.6456', 'learning_rate': '1.392e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 468763, 'train_runtime': '240.5', 'train_tokens_per_second': '1949'}
+{'loss': '1.554', 'grad_norm': '0.4676', 'learning_rate': '1.398e-05', 'epoch': '0.02808', 'num_input_tokens_seen': 470810, 'train_runtime': '241.5', 'train_tokens_per_second': '1949'}
+{'loss': '1.236', 'grad_norm': '0.5192', 'learning_rate': '1.404e-05', 'epoch': '0.02821', 'num_input_tokens_seen': 472857, 'train_runtime': '242.6', 'train_tokens_per_second': '1949'}
+{'loss': '1.66', 'grad_norm': '0.5697', 'learning_rate': '1.41e-05', 'epoch': '0.02833', 'num_input_tokens_seen': 474904, 'train_runtime': '243.6', 'train_tokens_per_second': '1950'}
+{'loss': '1.295', 'grad_norm': '0.4977', 'learning_rate': '1.416e-05', 'epoch': '0.02845', 'num_input_tokens_seen': 476951, 'train_runtime': '244.6', 'train_tokens_per_second': '1950'}
+{'loss': '1.356', 'grad_norm': '0.5231', 'learning_rate': '1.422e-05', 'epoch': '0.02857', 'num_input_tokens_seen': 478998, 'train_runtime': '245.7', 'train_tokens_per_second': '1950'}
+{'loss': '1.327', 'grad_norm': '0.5909', 'learning_rate': '1.429e-05', 'epoch': '0.02869', 'num_input_tokens_seen': 481045, 'train_runtime': '246.7', 'train_tokens_per_second': '1950'}
+{'loss': '0.8153', 'grad_norm': '0.4576', 'learning_rate': '1.435e-05', 'epoch': '0.02882', 'num_input_tokens_seen': 483092, 'train_runtime': '247.8', 'train_tokens_per_second': '1950'}
+{'loss': '1.099', 'grad_norm': '0.428', 'learning_rate': '1.441e-05', 'epoch': '0.02894', 'num_input_tokens_seen': 485139, 'train_runtime': '248.8', 'train_tokens_per_second': '1950'}
+{'loss': '1.441', 'grad_norm': '0.5931', 'learning_rate': '1.447e-05', 'epoch': '0.02906', 'num_input_tokens_seen': 487186, 'train_runtime': '249.9', 'train_tokens_per_second': '1950'}
+{'loss': '0.6164', 'grad_norm': '0.3841', 'learning_rate': '1.453e-05', 'epoch': '0.02918', 'num_input_tokens_seen': 489233, 'train_runtime': '250.9', 'train_tokens_per_second': '1950'}
+{'loss': '1.101', 'grad_norm': '0.4834', 'learning_rate': '1.459e-05', 'epoch': '0.0293', 'num_input_tokens_seen': 491280, 'train_runtime': '251.9', 'train_tokens_per_second': '1950'}
+{'loss': '1.662', 'grad_norm': '0.75', 'learning_rate': '1.465e-05', 'epoch': '0.02943', 'num_input_tokens_seen': 493327, 'train_runtime': '253', 'train_tokens_per_second': '1950'}
+{'loss': '0.7168', 'grad_norm': '0.4067', 'learning_rate': '1.471e-05', 'epoch': '0.02955', 'num_input_tokens_seen': 495374, 'train_runtime': '254', 'train_tokens_per_second': '1950'}
+{'loss': '2.19', 'grad_norm': '0.7366', 'learning_rate': '1.477e-05', 'epoch': '0.02967', 'num_input_tokens_seen': 497421, 'train_runtime': '255.1', 'train_tokens_per_second': '1950'}
+{'loss': '1.161', 'grad_norm': '0.5377', 'learning_rate': '1.484e-05', 'epoch': '0.02979', 'num_input_tokens_seen': 499468, 'train_runtime': '256.1', 'train_tokens_per_second': '1950'}
+{'loss': '0.7109', 'grad_norm': '0.3639', 'learning_rate': '1.49e-05', 'epoch': '0.02991', 'num_input_tokens_seen': 501515, 'train_runtime': '257.2', 'train_tokens_per_second': '1950'}
+{'loss': '1.674', 'grad_norm': '0.5716', 'learning_rate': '1.496e-05', 'epoch': '0.03004', 'num_input_tokens_seen': 503562, 'train_runtime': '258.2', 'train_tokens_per_second': '1950'}
+{'loss': '1.543', 'grad_norm': '0.6212', 'learning_rate': '1.502e-05', 'epoch': '0.03016', 'num_input_tokens_seen': 505609, 'train_runtime': '259.3', 'train_tokens_per_second': '1950'}
+{'loss': '1.2', 'grad_norm': '0.5488', 'learning_rate': '1.508e-05', 'epoch': '0.03028', 'num_input_tokens_seen': 507656, 'train_runtime': '260.3', 'train_tokens_per_second': '1950'}
+{'loss': '1.309', 'grad_norm': '0.4928', 'learning_rate': '1.514e-05', 'epoch': '0.0304', 'num_input_tokens_seen': 509703, 'train_runtime': '261.4', 'train_tokens_per_second': '1950'}
+{'loss': '1.394', 'grad_norm': '0.4938', 'learning_rate': '1.52e-05', 'epoch': '0.03053', 'num_input_tokens_seen': 511750, 'train_runtime': '262.4', 'train_tokens_per_second': '1950'}
+{'loss': '1.185', 'grad_norm': '0.4851', 'learning_rate': '1.526e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 513797, 'train_runtime': '263.5', 'train_tokens_per_second': '1950'}
+{'loss': '1.614', 'grad_norm': '0.7283', 'learning_rate': '1.532e-05', 'epoch': '0.03077', 'num_input_tokens_seen': 515844, 'train_runtime': '264.5', 'train_tokens_per_second': '1950'}
+{'loss': '1.91', 'grad_norm': '0.7521', 'learning_rate': '1.538e-05', 'epoch': '0.03089', 'num_input_tokens_seen': 517891, 'train_runtime': '265.6', 'train_tokens_per_second': '1950'}
+{'loss': '1.459', 'grad_norm': '0.6243', 'learning_rate': '1.545e-05', 'epoch': '0.03101', 'num_input_tokens_seen': 519938, 'train_runtime': '266.6', 'train_tokens_per_second': '1950'}
+{'loss': '1.593', 'grad_norm': '0.6734', 'learning_rate': '1.551e-05', 'epoch': '0.03114', 'num_input_tokens_seen': 521985, 'train_runtime': '267.6', 'train_tokens_per_second': '1950'}
+{'loss': '0.6935', 'grad_norm': '0.4742', 'learning_rate': '1.557e-05', 'epoch': '0.03126', 'num_input_tokens_seen': 524032, 'train_runtime': '268.7', 'train_tokens_per_second': '1950'}
+{'loss': '2.115', 'grad_norm': '0.7634', 'learning_rate': '1.563e-05', 'epoch': '0.03138', 'num_input_tokens_seen': 526079, 'train_runtime': '269.7', 'train_tokens_per_second': '1950'}
+{'loss': '1.121', 'grad_norm': '0.478', 'learning_rate': '1.569e-05', 'epoch': '0.0315', 'num_input_tokens_seen': 528126, 'train_runtime': '270.8', 'train_tokens_per_second': '1950'}
+{'loss': '1.432', 'grad_norm': '0.6416', 'learning_rate': '1.575e-05', 'epoch': '0.03162', 'num_input_tokens_seen': 530173, 'train_runtime': '271.8', 'train_tokens_per_second': '1950'}
+{'loss': '0.737', 'grad_norm': '0.4171', 'learning_rate': '1.581e-05', 'epoch': '0.03175', 'num_input_tokens_seen': 532220, 'train_runtime': '272.9', 'train_tokens_per_second': '1950'}
+{'loss': '1.551', 'grad_norm': '0.6279', 'learning_rate': '1.587e-05', 'epoch': '0.03187', 'num_input_tokens_seen': 534267, 'train_runtime': '273.9', 'train_tokens_per_second': '1951'}
+{'loss': '1.129', 'grad_norm': '0.5384', 'learning_rate': '1.593e-05', 'epoch': '0.03199', 'num_input_tokens_seen': 536314, 'train_runtime': '275', 'train_tokens_per_second': '1951'}
+{'loss': '1.203', 'grad_norm': '0.5447', 'learning_rate': '1.6e-05', 'epoch': '0.03211', 'num_input_tokens_seen': 538361, 'train_runtime': '276', 'train_tokens_per_second': '1951'}
+{'loss': '0.01303', 'grad_norm': '0.06387', 'learning_rate': '1.606e-05', 'epoch': '0.03223', 'num_input_tokens_seen': 540408, 'train_runtime': '277', 'train_tokens_per_second': '1951'}
+{'loss': '1.557', 'grad_norm': '0.6978', 'learning_rate': '1.612e-05', 'epoch': '0.03236', 'num_input_tokens_seen': 542455, 'train_runtime': '278.1', 'train_tokens_per_second': '1951'}
+{'loss': '1.018', 'grad_norm': '0.5121', 'learning_rate': '1.618e-05', 'epoch': '0.03248', 'num_input_tokens_seen': 544502, 'train_runtime': '279.1', 'train_tokens_per_second': '1951'}
+{'loss': '1.58', 'grad_norm': '0.6769', 'learning_rate': '1.624e-05', 'epoch': '0.0326', 'num_input_tokens_seen': 546549, 'train_runtime': '280.2', 'train_tokens_per_second': '1951'}
+{'loss': '0.7888', 'grad_norm': '0.4703', 'learning_rate': '1.63e-05', 'epoch': '0.03272', 'num_input_tokens_seen': 548596, 'train_runtime': '281.2', 'train_tokens_per_second': '1951'}
+{'loss': '1.311', 'grad_norm': '0.5751', 'learning_rate': '1.636e-05', 'epoch': '0.03284', 'num_input_tokens_seen': 550643, 'train_runtime': '282.3', 'train_tokens_per_second': '1951'}
+{'loss': '1.315', 'grad_norm': '0.6388', 'learning_rate': '1.642e-05', 'epoch': '0.03297', 'num_input_tokens_seen': 552690, 'train_runtime': '283.3', 'train_tokens_per_second': '1951'}
+{'loss': '0.7666', 'grad_norm': '0.4632', 'learning_rate': '1.648e-05', 'epoch': '0.03309', 'num_input_tokens_seen': 554737, 'train_runtime': '284.4', 'train_tokens_per_second': '1951'}
+{'loss': '1.125', 'grad_norm': '0.6016', 'learning_rate': '1.654e-05', 'epoch': '0.03321', 'num_input_tokens_seen': 556784, 'train_runtime': '285.4', 'train_tokens_per_second': '1951'}
+{'loss': '1.318', 'grad_norm': '0.6857', 'learning_rate': '1.661e-05', 'epoch': '0.03333', 'num_input_tokens_seen': 558831, 'train_runtime': '286.4', 'train_tokens_per_second': '1951'}
+{'loss': '1.422', 'grad_norm': '0.6597', 'learning_rate': '1.667e-05', 'epoch': '0.03346', 'num_input_tokens_seen': 560878, 'train_runtime': '287.5', 'train_tokens_per_second': '1951'}
+{'loss': '1.624', 'grad_norm': '0.7202', 'learning_rate': '1.673e-05', 'epoch': '0.03358', 'num_input_tokens_seen': 562925, 'train_runtime': '288.5', 'train_tokens_per_second': '1951'}
+{'loss': '1.189', 'grad_norm': '0.6913', 'learning_rate': '1.679e-05', 'epoch': '0.0337', 'num_input_tokens_seen': 564972, 'train_runtime': '289.6', 'train_tokens_per_second': '1951'}
+{'loss': '0.6437', 'grad_norm': '0.5177', 'learning_rate': '1.685e-05', 'epoch': '0.03382', 'num_input_tokens_seen': 567019, 'train_runtime': '290.6', 'train_tokens_per_second': '1951'}
+{'loss': '0.8028', 'grad_norm': '0.4772', 'learning_rate': '1.691e-05', 'epoch': '0.03394', 'num_input_tokens_seen': 569066, 'train_runtime': '291.7', 'train_tokens_per_second': '1951'}
+{'loss': '1.109', 'grad_norm': '0.5489', 'learning_rate': '1.697e-05', 'epoch': '0.03407', 'num_input_tokens_seen': 571113, 'train_runtime': '292.7', 'train_tokens_per_second': '1951'}
+{'loss': '1.318', 'grad_norm': '0.7151', 'learning_rate': '1.703e-05', 'epoch': '0.03419', 'num_input_tokens_seen': 573160, 'train_runtime': '293.8', 'train_tokens_per_second': '1951'}
+{'loss': '1.364', 'grad_norm': '0.6679', 'learning_rate': '1.709e-05', 'epoch': '0.03431', 'num_input_tokens_seen': 575207, 'train_runtime': '294.8', 'train_tokens_per_second': '1951'}
+{'loss': '1.553', 'grad_norm': '0.7686', 'learning_rate': '1.716e-05', 'epoch': '0.03443', 'num_input_tokens_seen': 577254, 'train_runtime': '295.8', 'train_tokens_per_second': '1951'}
+{'loss': '0.7742', 'grad_norm': '0.5299', 'learning_rate': '1.722e-05', 'epoch': '0.03455', 'num_input_tokens_seen': 579301, 'train_runtime': '296.9', 'train_tokens_per_second': '1951'}
+{'loss': '0.6421', 'grad_norm': '0.4721', 'learning_rate': '1.728e-05', 'epoch': '0.03468', 'num_input_tokens_seen': 581348, 'train_runtime': '297.9', 'train_tokens_per_second': '1951'}
+{'loss': '1.299', 'grad_norm': '0.7587', 'learning_rate': '1.734e-05', 'epoch': '0.0348', 'num_input_tokens_seen': 583395, 'train_runtime': '299', 'train_tokens_per_second': '1951'}
+{'loss': '1.368', 'grad_norm': '0.7599', 'learning_rate': '1.74e-05', 'epoch': '0.03492', 'num_input_tokens_seen': 585442, 'train_runtime': '300', 'train_tokens_per_second': '1951'}
+{'loss': '1.21', 'grad_norm': '1.106', 'learning_rate': '1.746e-05', 'epoch': '0.03504', 'num_input_tokens_seen': 587489, 'train_runtime': '301', 'train_tokens_per_second': '1951'}
+{'loss': '1.371', 'grad_norm': '0.7232', 'learning_rate': '1.752e-05', 'epoch': '0.03516', 'num_input_tokens_seen': 589536, 'train_runtime': '302.1', 'train_tokens_per_second': '1951'}
+{'loss': '0.6649', 'grad_norm': '0.5242', 'learning_rate': '1.758e-05', 'epoch': '0.03529', 'num_input_tokens_seen': 591583, 'train_runtime': '303.1', 'train_tokens_per_second': '1952'}
+{'loss': '1.356', 'grad_norm': '0.6755', 'learning_rate': '1.764e-05', 'epoch': '0.03541', 'num_input_tokens_seen': 593630, 'train_runtime': '304.2', 'train_tokens_per_second': '1952'}
+{'loss': '0.01393', 'grad_norm': '0.07414', 'learning_rate': '1.77e-05', 'epoch': '0.03553', 'num_input_tokens_seen': 595677, 'train_runtime': '305.2', 'train_tokens_per_second': '1952'}
+{'loss': '0.7759', 'grad_norm': '0.5154', 'learning_rate': '1.777e-05', 'epoch': '0.03565', 'num_input_tokens_seen': 597724, 'train_runtime': '306.3', 'train_tokens_per_second': '1952'}
+{'loss': '1.218', 'grad_norm': '0.6359', 'learning_rate': '1.783e-05', 'epoch': '0.03578', 'num_input_tokens_seen': 599771, 'train_runtime': '307.3', 'train_tokens_per_second': '1952'}
+{'loss': '1.09', 'grad_norm': '0.6873', 'learning_rate': '1.789e-05', 'epoch': '0.0359', 'num_input_tokens_seen': 601818, 'train_runtime': '308.4', 'train_tokens_per_second': '1952'}
+{'loss': '1.241', 'grad_norm': '0.6851', 'learning_rate': '1.795e-05', 'epoch': '0.03602', 'num_input_tokens_seen': 603865, 'train_runtime': '309.4', 'train_tokens_per_second': '1952'}
+{'loss': '1.213', 'grad_norm': '0.6306', 'learning_rate': '1.801e-05', 'epoch': '0.03614', 'num_input_tokens_seen': 605912, 'train_runtime': '310.4', 'train_tokens_per_second': '1952'}
+{'loss': '1.18', 'grad_norm': '0.7958', 'learning_rate': '1.807e-05', 'epoch': '0.03626', 'num_input_tokens_seen': 607959, 'train_runtime': '311.5', 'train_tokens_per_second': '1952'}
+{'loss': '0.707', 'grad_norm': '0.5156', 'learning_rate': '1.813e-05', 'epoch': '0.03639', 'num_input_tokens_seen': 610006, 'train_runtime': '312.5', 'train_tokens_per_second': '1952'}
+{'loss': '0.6689', 'grad_norm': '0.4688', 'learning_rate': '1.819e-05', 'epoch': '0.03651', 'num_input_tokens_seen': 612053, 'train_runtime': '313.6', 'train_tokens_per_second': '1952'}
+{'loss': '0.6581', 'grad_norm': '0.4796', 'learning_rate': '1.825e-05', 'epoch': '0.03663', 'num_input_tokens_seen': 614100, 'train_runtime': '314.6', 'train_tokens_per_second': '1952'}
+{'loss': '1.461', 'grad_norm': '0.7233', 'learning_rate': '1.832e-05', 'epoch': '0.03675', 'num_input_tokens_seen': 616147, 'train_runtime': '315.7', 'train_tokens_per_second': '1952'}
+{'loss': '1.333', 'grad_norm': '0.7007', 'learning_rate': '1.838e-05', 'epoch': '0.03687', 'num_input_tokens_seen': 618194, 'train_runtime': '316.7', 'train_tokens_per_second': '1952'}
+{'loss': '0.6597', 'grad_norm': '0.4982', 'learning_rate': '1.844e-05', 'epoch': '0.037', 'num_input_tokens_seen': 620241, 'train_runtime': '317.7', 'train_tokens_per_second': '1952'}
+{'loss': '1.492', 'grad_norm': '0.844', 'learning_rate': '1.85e-05', 'epoch': '0.03712', 'num_input_tokens_seen': 622288, 'train_runtime': '318.8', 'train_tokens_per_second': '1952'}
+{'loss': '1.365', 'grad_norm': '0.6764', 'learning_rate': '1.856e-05', 'epoch': '0.03724', 'num_input_tokens_seen': 624335, 'train_runtime': '319.8', 'train_tokens_per_second': '1952'}
+{'loss': '1.042', 'grad_norm': '0.548', 'learning_rate': '1.862e-05', 'epoch': '0.03736', 'num_input_tokens_seen': 626382, 'train_runtime': '320.9', 'train_tokens_per_second': '1952'}
+{'loss': '0.8888', 'grad_norm': '0.5708', 'learning_rate': '1.868e-05', 'epoch': '0.03748', 'num_input_tokens_seen': 628429, 'train_runtime': '321.9', 'train_tokens_per_second': '1952'}
+{'loss': '1', 'grad_norm': '0.7019', 'learning_rate': '1.874e-05', 'epoch': '0.03761', 'num_input_tokens_seen': 630476, 'train_runtime': '323', 'train_tokens_per_second': '1952'}
+{'loss': '1.277', 'grad_norm': '0.7269', 'learning_rate': '1.88e-05', 'epoch': '0.03773', 'num_input_tokens_seen': 632523, 'train_runtime': '324', 'train_tokens_per_second': '1952'}
+{'loss': '1.12', 'grad_norm': '0.7019', 'learning_rate': '1.886e-05', 'epoch': '0.03785', 'num_input_tokens_seen': 634570, 'train_runtime': '325.1', 'train_tokens_per_second': '1952'}
+{'loss': '1.537', 'grad_norm': '0.7843', 'learning_rate': '1.893e-05', 'epoch': '0.03797', 'num_input_tokens_seen': 636617, 'train_runtime': '326.1', 'train_tokens_per_second': '1952'}
+{'loss': '0.9306', 'grad_norm': '0.687', 'learning_rate': '1.899e-05', 'epoch': '0.0381', 'num_input_tokens_seen': 638664, 'train_runtime': '327.1', 'train_tokens_per_second': '1952'}
+{'loss': '1.538', 'grad_norm': '0.9485', 'learning_rate': '1.905e-05', 'epoch': '0.03822', 'num_input_tokens_seen': 640711, 'train_runtime': '328.2', 'train_tokens_per_second': '1952'}
+{'loss': '0.9453', 'grad_norm': '0.6511', 'learning_rate': '1.911e-05', 'epoch': '0.03834', 'num_input_tokens_seen': 642758, 'train_runtime': '329.2', 'train_tokens_per_second': '1952'}
+{'loss': '1.093', 'grad_norm': '0.6228', 'learning_rate': '1.917e-05', 'epoch': '0.03846', 'num_input_tokens_seen': 644805, 'train_runtime': '330.3', 'train_tokens_per_second': '1952'}
+{'loss': '1.554', 'grad_norm': '0.7794', 'learning_rate': '1.923e-05', 'epoch': '0.03858', 'num_input_tokens_seen': 646852, 'train_runtime': '331.3', 'train_tokens_per_second': '1952'}
+{'loss': '1.767', 'grad_norm': '1.014', 'learning_rate': '1.929e-05', 'epoch': '0.03871', 'num_input_tokens_seen': 648899, 'train_runtime': '332.4', 'train_tokens_per_second': '1952'}
+{'loss': '1.603', 'grad_norm': '0.9031', 'learning_rate': '1.935e-05', 'epoch': '0.03883', 'num_input_tokens_seen': 650946, 'train_runtime': '333.4', 'train_tokens_per_second': '1952'}
+{'loss': '1.165', 'grad_norm': '0.7525', 'learning_rate': '1.941e-05', 'epoch': '0.03895', 'num_input_tokens_seen': 652993, 'train_runtime': '334.4', 'train_tokens_per_second': '1952'}
+{'loss': '1.363', 'grad_norm': '0.8083', 'learning_rate': '1.947e-05', 'epoch': '0.03907', 'num_input_tokens_seen': 655040, 'train_runtime': '335.5', 'train_tokens_per_second': '1952'}
+{'loss': '1.223', 'grad_norm': '0.8602', 'learning_rate': '1.954e-05', 'epoch': '0.03919', 'num_input_tokens_seen': 657087, 'train_runtime': '336.5', 'train_tokens_per_second': '1952'}
+{'loss': '1.467', 'grad_norm': '0.9287', 'learning_rate': '1.96e-05', 'epoch': '0.03932', 'num_input_tokens_seen': 659134, 'train_runtime': '337.6', 'train_tokens_per_second': '1953'}
+{'loss': '0.7704', 'grad_norm': '0.6444', 'learning_rate': '1.966e-05', 'epoch': '0.03944', 'num_input_tokens_seen': 661181, 'train_runtime': '338.6', 'train_tokens_per_second': '1953'}
+{'loss': '1.721', 'grad_norm': '0.909', 'learning_rate': '1.972e-05', 'epoch': '0.03956', 'num_input_tokens_seen': 663228, 'train_runtime': '339.7', 'train_tokens_per_second': '1953'}
+{'loss': '0.7316', 'grad_norm': '0.5927', 'learning_rate': '1.978e-05', 'epoch': '0.03968', 'num_input_tokens_seen': 665275, 'train_runtime': '340.7', 'train_tokens_per_second': '1953'}
+{'loss': '1.468', 'grad_norm': '0.848', 'learning_rate': '1.984e-05', 'epoch': '0.0398', 'num_input_tokens_seen': 667322, 'train_runtime': '341.7', 'train_tokens_per_second': '1953'}
+{'loss': '1.318', 'grad_norm': '0.9165', 'learning_rate': '1.99e-05', 'epoch': '0.03993', 'num_input_tokens_seen': 669369, 'train_runtime': '342.8', 'train_tokens_per_second': '1953'}
+{'loss': '0.6481', 'grad_norm': '0.5865', 'learning_rate': '1.996e-05', 'epoch': '0.04005', 'num_input_tokens_seen': 671416, 'train_runtime': '343.8', 'train_tokens_per_second': '1953'}
+{'loss': '0.6066', 'grad_norm': '0.5719', 'learning_rate': '2.002e-05', 'epoch': '0.04017', 'num_input_tokens_seen': 673463, 'train_runtime': '344.9', 'train_tokens_per_second': '1953'}
+{'loss': '1.22', 'grad_norm': '0.7782', 'learning_rate': '2.009e-05', 'epoch': '0.04029', 'num_input_tokens_seen': 675510, 'train_runtime': '345.9', 'train_tokens_per_second': '1953'}
+{'loss': '1.111', 'grad_norm': '0.7964', 'learning_rate': '2.015e-05', 'epoch': '0.04042', 'num_input_tokens_seen': 677557, 'train_runtime': '347', 'train_tokens_per_second': '1953'}
+{'loss': '1.49', 'grad_norm': '0.8499', 'learning_rate': '2.021e-05', 'epoch': '0.04054', 'num_input_tokens_seen': 679604, 'train_runtime': '348', 'train_tokens_per_second': '1953'}
+{'loss': '1.515', 'grad_norm': '0.9592', 'learning_rate': '2.027e-05', 'epoch': '0.04066', 'num_input_tokens_seen': 681651, 'train_runtime': '349', 'train_tokens_per_second': '1953'}
+{'loss': '1.546', 'grad_norm': '1.062', 'learning_rate': '2.033e-05', 'epoch': '0.04078', 'num_input_tokens_seen': 683698, 'train_runtime': '350.1', 'train_tokens_per_second': '1953'}
+{'loss': '1.143', 'grad_norm': '0.7702', 'learning_rate': '2.039e-05', 'epoch': '0.0409', 'num_input_tokens_seen': 685745, 'train_runtime': '351.1', 'train_tokens_per_second': '1953'}
+{'loss': '0.7024', 'grad_norm': '0.608', 'learning_rate': '2.045e-05', 'epoch': '0.04103', 'num_input_tokens_seen': 687792, 'train_runtime': '352.2', 'train_tokens_per_second': '1953'}
+{'loss': '1.281', 'grad_norm': '0.8494', 'learning_rate': '2.051e-05', 'epoch': '0.04115', 'num_input_tokens_seen': 689839, 'train_runtime': '353.2', 'train_tokens_per_second': '1953'}
+{'loss': '0.6537', 'grad_norm': '0.5859', 'learning_rate': '2.057e-05', 'epoch': '0.04127', 'num_input_tokens_seen': 691886, 'train_runtime': '354.2', 'train_tokens_per_second': '1953'}
+{'loss': '0.5869', 'grad_norm': '0.507', 'learning_rate': '2.063e-05', 'epoch': '0.04139', 'num_input_tokens_seen': 693933, 'train_runtime': '355.3', 'train_tokens_per_second': '1953'}
+{'loss': '0.9828', 'grad_norm': '0.753', 'learning_rate': '2.07e-05', 'epoch': '0.04151', 'num_input_tokens_seen': 695980, 'train_runtime': '356.3', 'train_tokens_per_second': '1953'}
+{'loss': '1.034', 'grad_norm': '0.7872', 'learning_rate': '2.076e-05', 'epoch': '0.04164', 'num_input_tokens_seen': 698027, 'train_runtime': '357.4', 'train_tokens_per_second': '1953'}
+{'loss': '1.136', 'grad_norm': '0.841', 'learning_rate': '2.082e-05', 'epoch': '0.04176', 'num_input_tokens_seen': 700074, 'train_runtime': '358.4', 'train_tokens_per_second': '1953'}
+{'loss': '1.029', 'grad_norm': '0.8661', 'learning_rate': '2.088e-05', 'epoch': '0.04188', 'num_input_tokens_seen': 702121, 'train_runtime': '359.5', 'train_tokens_per_second': '1953'}
+{'loss': '1.447', 'grad_norm': '0.8867', 'learning_rate': '2.094e-05', 'epoch': '0.042', 'num_input_tokens_seen': 704168, 'train_runtime': '360.5', 'train_tokens_per_second': '1953'}
+{'loss': '1.149', 'grad_norm': '0.8136', 'learning_rate': '2.1e-05', 'epoch': '0.04212', 'num_input_tokens_seen': 706215, 'train_runtime': '361.5', 'train_tokens_per_second': '1953'}
+{'loss': '1.062', 'grad_norm': '0.8538', 'learning_rate': '2.106e-05', 'epoch': '0.04225', 'num_input_tokens_seen': 708262, 'train_runtime': '362.6', 'train_tokens_per_second': '1953'}
+{'loss': '1.476', 'grad_norm': '1.063', 'learning_rate': '2.112e-05', 'epoch': '0.04237', 'num_input_tokens_seen': 710309, 'train_runtime': '363.6', 'train_tokens_per_second': '1953'}
+{'loss': '1.556', 'grad_norm': '1.099', 'learning_rate': '2.118e-05', 'epoch': '0.04249', 'num_input_tokens_seen': 712356, 'train_runtime': '364.7', 'train_tokens_per_second': '1953'}
+{'loss': '1.58', 'grad_norm': '1.042', 'learning_rate': '2.125e-05', 'epoch': '0.04261', 'num_input_tokens_seen': 714403, 'train_runtime': '365.7', 'train_tokens_per_second': '1953'}
+{'loss': '1.422', 'grad_norm': '0.9363', 'learning_rate': '2.131e-05', 'epoch': '0.04274', 'num_input_tokens_seen': 716450, 'train_runtime': '366.8', 'train_tokens_per_second': '1953'}
+{'loss': '0.7304', 'grad_norm': '0.6753', 'learning_rate': '2.137e-05', 'epoch': '0.04286', 'num_input_tokens_seen': 718497, 'train_runtime': '367.8', 'train_tokens_per_second': '1954'}
+{'loss': '0.6162', 'grad_norm': '0.7258', 'learning_rate': '2.143e-05', 'epoch': '0.04298', 'num_input_tokens_seen': 720544, 'train_runtime': '368.8', 'train_tokens_per_second': '1954'}
+{'loss': '1.107', 'grad_norm': '0.8577', 'learning_rate': '2.149e-05', 'epoch': '0.0431', 'num_input_tokens_seen': 722591, 'train_runtime': '369.9', 'train_tokens_per_second': '1954'}
+{'loss': '0.7794', 'grad_norm': '0.7508', 'learning_rate': '2.155e-05', 'epoch': '0.04322', 'num_input_tokens_seen': 724638, 'train_runtime': '370.9', 'train_tokens_per_second': '1954'}
+{'loss': '0.5031', 'grad_norm': '0.5686', 'learning_rate': '2.161e-05', 'epoch': '0.04335', 'num_input_tokens_seen': 726685, 'train_runtime': '372', 'train_tokens_per_second': '1954'}
+{'loss': '1.096', 'grad_norm': '0.8762', 'learning_rate': '2.167e-05', 'epoch': '0.04347', 'num_input_tokens_seen': 728732, 'train_runtime': '373', 'train_tokens_per_second': '1954'}
+{'loss': '1.7', 'grad_norm': '1.311', 'learning_rate': '2.173e-05', 'epoch': '0.04359', 'num_input_tokens_seen': 730779, 'train_runtime': '374.1', 'train_tokens_per_second': '1954'}
+{'loss': '1.493', 'grad_norm': '1.053', 'learning_rate': '2.179e-05', 'epoch': '0.04371', 'num_input_tokens_seen': 732826, 'train_runtime': '375.1', 'train_tokens_per_second': '1954'}
+{'loss': '1.481', 'grad_norm': '1.015', 'learning_rate': '2.186e-05', 'epoch': '0.04383', 'num_input_tokens_seen': 734873, 'train_runtime': '376.2', 'train_tokens_per_second': '1954'}
+{'loss': '0.6684', 'grad_norm': '0.7169', 'learning_rate': '2.192e-05', 'epoch': '0.04396', 'num_input_tokens_seen': 736920, 'train_runtime': '377.2', 'train_tokens_per_second': '1954'}
+{'loss': '1.017', 'grad_norm': '0.7565', 'learning_rate': '2.198e-05', 'epoch': '0.04408', 'num_input_tokens_seen': 738967, 'train_runtime': '378.2', 'train_tokens_per_second': '1954'}
+{'loss': '0.7118', 'grad_norm': '0.7867', 'learning_rate': '2.204e-05', 'epoch': '0.0442', 'num_input_tokens_seen': 741014, 'train_runtime': '379.3', 'train_tokens_per_second': '1954'}
+{'loss': '1.514', 'grad_norm': '1.231', 'learning_rate': '2.21e-05', 'epoch': '0.04432', 'num_input_tokens_seen': 743061, 'train_runtime': '380.3', 'train_tokens_per_second': '1954'}
+{'loss': '1.372', 'grad_norm': '0.9992', 'learning_rate': '2.216e-05', 'epoch': '0.04444', 'num_input_tokens_seen': 745108, 'train_runtime': '381.4', 'train_tokens_per_second': '1954'}
+{'loss': '0.6875', 'grad_norm': '0.7443', 'learning_rate': '2.222e-05', 'epoch': '0.04457', 'num_input_tokens_seen': 747155, 'train_runtime': '382.4', 'train_tokens_per_second': '1954'}
+{'loss': '1.5', 'grad_norm': '1.089', 'learning_rate': '2.228e-05', 'epoch': '0.04469', 'num_input_tokens_seen': 749202, 'train_runtime': '383.5', 'train_tokens_per_second': '1954'}
+{'loss': '1.108', 'grad_norm': '1.018', 'learning_rate': '2.234e-05', 'epoch': '0.04481', 'num_input_tokens_seen': 751249, 'train_runtime': '384.5', 'train_tokens_per_second': '1954'}
+{'loss': '0.7884', 'grad_norm': '0.9202', 'learning_rate': '2.241e-05', 'epoch': '0.04493', 'num_input_tokens_seen': 753296, 'train_runtime': '385.6', 'train_tokens_per_second': '1954'}
+{'loss': '1.155', 'grad_norm': '0.9054', 'learning_rate': '2.247e-05', 'epoch': '0.04505', 'num_input_tokens_seen': 755343, 'train_runtime': '386.6', 'train_tokens_per_second': '1954'}
+{'loss': '1.238', 'grad_norm': '0.9672', 'learning_rate': '2.253e-05', 'epoch': '0.04518', 'num_input_tokens_seen': 757390, 'train_runtime': '387.6', 'train_tokens_per_second': '1954'}
+{'loss': '1.686', 'grad_norm': '1.292', 'learning_rate': '2.259e-05', 'epoch': '0.0453', 'num_input_tokens_seen': 759437, 'train_runtime': '388.7', 'train_tokens_per_second': '1954'}
+{'loss': '1.173', 'grad_norm': '0.9433', 'learning_rate': '2.265e-05', 'epoch': '0.04542', 'num_input_tokens_seen': 761484, 'train_runtime': '389.7', 'train_tokens_per_second': '1954'}
+{'loss': '1.142', 'grad_norm': '0.932', 'learning_rate': '2.271e-05', 'epoch': '0.04554', 'num_input_tokens_seen': 763531, 'train_runtime': '390.8', 'train_tokens_per_second': '1954'}
+{'loss': '0.6799', 'grad_norm': '0.6995', 'learning_rate': '2.277e-05', 'epoch': '0.04567', 'num_input_tokens_seen': 765578, 'train_runtime': '391.8', 'train_tokens_per_second': '1954'}
+{'loss': '1.097', 'grad_norm': '0.9038', 'learning_rate': '2.283e-05', 'epoch': '0.04579', 'num_input_tokens_seen': 767625, 'train_runtime': '392.9', 'train_tokens_per_second': '1954'}
+{'loss': '1.373', 'grad_norm': '1.087', 'learning_rate': '2.289e-05', 'epoch': '0.04591', 'num_input_tokens_seen': 769672, 'train_runtime': '393.9', 'train_tokens_per_second': '1954'}
+{'loss': '1.311', 'grad_norm': '1.024', 'learning_rate': '2.295e-05', 'epoch': '0.04603', 'num_input_tokens_seen': 771719, 'train_runtime': '394.9', 'train_tokens_per_second': '1954'}
+{'loss': '1.389', 'grad_norm': '1.122', 'learning_rate': '2.302e-05', 'epoch': '0.04615', 'num_input_tokens_seen': 773766, 'train_runtime': '396', 'train_tokens_per_second': '1954'}
+{'loss': '1.387', 'grad_norm': '1.237', 'learning_rate': '2.308e-05', 'epoch': '0.04628', 'num_input_tokens_seen': 775813, 'train_runtime': '397', 'train_tokens_per_second': '1954'}
+{'loss': '1.428', 'grad_norm': '1.192', 'learning_rate': '2.314e-05', 'epoch': '0.0464', 'num_input_tokens_seen': 777860, 'train_runtime': '398.1', 'train_tokens_per_second': '1954'}
+{'loss': '1.465', 'grad_norm': '1.139', 'learning_rate': '2.32e-05', 'epoch': '0.04652', 'num_input_tokens_seen': 779907, 'train_runtime': '399.1', 'train_tokens_per_second': '1954'}
+{'loss': '0.5441', 'grad_norm': '0.5965', 'learning_rate': '2.326e-05', 'epoch': '0.04664', 'num_input_tokens_seen': 781954, 'train_runtime': '400.2', 'train_tokens_per_second': '1954'}
+{'loss': '0.996', 'grad_norm': '0.92', 'learning_rate': '2.332e-05', 'epoch': '0.04676', 'num_input_tokens_seen': 784001, 'train_runtime': '401.2', 'train_tokens_per_second': '1954'}
+{'loss': '1.155', 'grad_norm': '1.001', 'learning_rate': '2.338e-05', 'epoch': '0.04689', 'num_input_tokens_seen': 786048, 'train_runtime': '402.2', 'train_tokens_per_second': '1954'}
+{'loss': '1.485', 'grad_norm': '1.166', 'learning_rate': '2.344e-05', 'epoch': '0.04701', 'num_input_tokens_seen': 788095, 'train_runtime': '403.3', 'train_tokens_per_second': '1954'}
+{'loss': '1.173', 'grad_norm': '1.012', 'learning_rate': '2.35e-05', 'epoch': '0.04713', 'num_input_tokens_seen': 790142, 'train_runtime': '404.3', 'train_tokens_per_second': '1954'}
+{'loss': '1.385', 'grad_norm': '1.233', 'learning_rate': '2.357e-05', 'epoch': '0.04725', 'num_input_tokens_seen': 792189, 'train_runtime': '405.4', 'train_tokens_per_second': '1954'}
+{'loss': '0.7973', 'grad_norm': '0.7914', 'learning_rate': '2.363e-05', 'epoch': '0.04737', 'num_input_tokens_seen': 794236, 'train_runtime': '406.4', 'train_tokens_per_second': '1954'}
+{'loss': '1.45', 'grad_norm': '1.225', 'learning_rate': '2.369e-05', 'epoch': '0.0475', 'num_input_tokens_seen': 796283, 'train_runtime': '407.4', 'train_tokens_per_second': '1954'}
+{'loss': '0.9021', 'grad_norm': '0.7517', 'learning_rate': '2.375e-05', 'epoch': '0.04762', 'num_input_tokens_seen': 798330, 'train_runtime': '408.5', 'train_tokens_per_second': '1954'}
+{'loss': '0.8751', 'grad_norm': '0.8999', 'learning_rate': '2.381e-05', 'epoch': '0.04774', 'num_input_tokens_seen': 800377, 'train_runtime': '409.5', 'train_tokens_per_second': '1954'}
+{'loss': '1.167', 'grad_norm': '0.951', 'learning_rate': '2.387e-05', 'epoch': '0.04786', 'num_input_tokens_seen': 802424, 'train_runtime': '410.6', 'train_tokens_per_second': '1954'}
+{'loss': '1.035', 'grad_norm': '1.208', 'learning_rate': '2.393e-05', 'epoch': '0.04799', 'num_input_tokens_seen': 804471, 'train_runtime': '411.6', 'train_tokens_per_second': '1954'}
+{'loss': '1.479', 'grad_norm': '1.259', 'learning_rate': '2.399e-05', 'epoch': '0.04811', 'num_input_tokens_seen': 806518, 'train_runtime': '412.7', 'train_tokens_per_second': '1954'}
+{'loss': '1.245', 'grad_norm': '0.9555', 'learning_rate': '2.405e-05', 'epoch': '0.04823', 'num_input_tokens_seen': 808565, 'train_runtime': '413.7', 'train_tokens_per_second': '1954'}
+{'loss': '1.056', 'grad_norm': '0.8719', 'learning_rate': '2.411e-05', 'epoch': '0.04835', 'num_input_tokens_seen': 810612, 'train_runtime': '414.7', 'train_tokens_per_second': '1954'}
+{'loss': '1.266', 'grad_norm': '0.9849', 'learning_rate': '2.418e-05', 'epoch': '0.04847', 'num_input_tokens_seen': 812659, 'train_runtime': '415.8', 'train_tokens_per_second': '1955'}
+{'loss': '0.7403', 'grad_norm': '0.9149', 'learning_rate': '2.424e-05', 'epoch': '0.0486', 'num_input_tokens_seen': 814706, 'train_runtime': '416.8', 'train_tokens_per_second': '1955'}
+{'loss': '1.221', 'grad_norm': '1.142', 'learning_rate': '2.43e-05', 'epoch': '0.04872', 'num_input_tokens_seen': 816753, 'train_runtime': '417.9', 'train_tokens_per_second': '1955'}
+{'loss': '0.6936', 'grad_norm': '0.8509', 'learning_rate': '2.436e-05', 'epoch': '0.04884', 'num_input_tokens_seen': 818800, 'train_runtime': '418.9', 'train_tokens_per_second': '1955'}
+{'loss': '0.7235', 'grad_norm': '0.8683', 'learning_rate': '2.442e-05', 'epoch': '0.04896', 'num_input_tokens_seen': 820847, 'train_runtime': '420', 'train_tokens_per_second': '1955'}
+{'loss': '1.459', 'grad_norm': '1.314', 'learning_rate': '2.448e-05', 'epoch': '0.04908', 'num_input_tokens_seen': 822894, 'train_runtime': '421', 'train_tokens_per_second': '1955'}
+{'loss': '1.521', 'grad_norm': '1.346', 'learning_rate': '2.454e-05', 'epoch': '0.04921', 'num_input_tokens_seen': 824941, 'train_runtime': '422', 'train_tokens_per_second': '1955'}
+{'loss': '0.6103', 'grad_norm': '0.7869', 'learning_rate': '2.46e-05', 'epoch': '0.04933', 'num_input_tokens_seen': 826988, 'train_runtime': '423.1', 'train_tokens_per_second': '1955'}
+{'loss': '0.7634', 'grad_norm': '0.9154', 'learning_rate': '2.466e-05', 'epoch': '0.04945', 'num_input_tokens_seen': 829035, 'train_runtime': '424.1', 'train_tokens_per_second': '1955'}
+{'loss': '0.519', 'grad_norm': '0.6597', 'learning_rate': '2.473e-05', 'epoch': '0.04957', 'num_input_tokens_seen': 831082, 'train_runtime': '425.2', 'train_tokens_per_second': '1955'}
+{'loss': '1.216', 'grad_norm': '1.108', 'learning_rate': '2.479e-05', 'epoch': '0.04969', 'num_input_tokens_seen': 833129, 'train_runtime': '426.2', 'train_tokens_per_second': '1955'}
+{'loss': '0.6684', 'grad_norm': '0.7702', 'learning_rate': '2.485e-05', 'epoch': '0.04982', 'num_input_tokens_seen': 835176, 'train_runtime': '427.2', 'train_tokens_per_second': '1955'}
+{'loss': '0.5844', 'grad_norm': '0.7106', 'learning_rate': '2.491e-05', 'epoch': '0.04994', 'num_input_tokens_seen': 837223, 'train_runtime': '428.3', 'train_tokens_per_second': '1955'}
+{'loss': '1.525', 'grad_norm': '1.278', 'learning_rate': '2.497e-05', 'epoch': '0.05006', 'num_input_tokens_seen': 839270, 'train_runtime': '429.3', 'train_tokens_per_second': '1955'}
+{'loss': '1.26', 'grad_norm': '1.193', 'learning_rate': '2.503e-05', 'epoch': '0.05018', 'num_input_tokens_seen': 841317, 'train_runtime': '430.4', 'train_tokens_per_second': '1955'}
+{'loss': '1.383', 'grad_norm': '1.183', 'learning_rate': '2.509e-05', 'epoch': '0.05031', 'num_input_tokens_seen': 843364, 'train_runtime': '431.4', 'train_tokens_per_second': '1955'}
+{'loss': '0.6213', 'grad_norm': '0.8509', 'learning_rate': '2.515e-05', 'epoch': '0.05043', 'num_input_tokens_seen': 845411, 'train_runtime': '432.5', 'train_tokens_per_second': '1955'}
+{'loss': '0.7826', 'grad_norm': '0.8761', 'learning_rate': '2.521e-05', 'epoch': '0.05055', 'num_input_tokens_seen': 847458, 'train_runtime': '433.5', 'train_tokens_per_second': '1955'}
+{'loss': '1.094', 'grad_norm': '0.981', 'learning_rate': '2.527e-05', 'epoch': '0.05067', 'num_input_tokens_seen': 849505, 'train_runtime': '434.6', 'train_tokens_per_second': '1955'}
+{'loss': '1.522', 'grad_norm': '1.446', 'learning_rate': '2.534e-05', 'epoch': '0.05079', 'num_input_tokens_seen': 851552, 'train_runtime': '435.6', 'train_tokens_per_second': '1955'}
+{'loss': '0.7217', 'grad_norm': '0.894', 'learning_rate': '2.54e-05', 'epoch': '0.05092', 'num_input_tokens_seen': 853599, 'train_runtime': '436.6', 'train_tokens_per_second': '1955'}
+{'loss': '0.7068', 'grad_norm': '0.8761', 'learning_rate': '2.546e-05', 'epoch': '0.05104', 'num_input_tokens_seen': 855646, 'train_runtime': '437.7', 'train_tokens_per_second': '1955'}
+{'loss': '1.045', 'grad_norm': '1.079', 'learning_rate': '2.552e-05', 'epoch': '0.05116', 'num_input_tokens_seen': 857693, 'train_runtime': '438.7', 'train_tokens_per_second': '1955'}
+{'loss': '1.271', 'grad_norm': '1.262', 'learning_rate': '2.558e-05', 'epoch': '0.05128', 'num_input_tokens_seen': 859740, 'train_runtime': '439.8', 'train_tokens_per_second': '1955'}
+{'loss': '1.291', 'grad_norm': '1.335', 'learning_rate': '2.564e-05', 'epoch': '0.0514', 'num_input_tokens_seen': 861787, 'train_runtime': '440.8', 'train_tokens_per_second': '1955'}
+{'loss': '0.8024', 'grad_norm': '0.9678', 'learning_rate': '2.57e-05', 'epoch': '0.05153', 'num_input_tokens_seen': 863834, 'train_runtime': '441.9', 'train_tokens_per_second': '1955'}
+{'loss': '1.331', 'grad_norm': '1.301', 'learning_rate': '2.576e-05', 'epoch': '0.05165', 'num_input_tokens_seen': 865881, 'train_runtime': '442.9', 'train_tokens_per_second': '1955'}
+{'loss': '1.514', 'grad_norm': '1.475', 'learning_rate': '2.582e-05', 'epoch': '0.05177', 'num_input_tokens_seen': 867928, 'train_runtime': '444', 'train_tokens_per_second': '1955'}
+{'loss': '0.6242', 'grad_norm': '1.046', 'learning_rate': '2.589e-05', 'epoch': '0.05189', 'num_input_tokens_seen': 869975, 'train_runtime': '445', 'train_tokens_per_second': '1955'}
+{'loss': '1.168', 'grad_norm': '1.187', 'learning_rate': '2.595e-05', 'epoch': '0.05201', 'num_input_tokens_seen': 872022, 'train_runtime': '446.1', 'train_tokens_per_second': '1955'}
+{'loss': '0.6858', 'grad_norm': '0.8636', 'learning_rate': '2.601e-05', 'epoch': '0.05214', 'num_input_tokens_seen': 874069, 'train_runtime': '447.1', 'train_tokens_per_second': '1955'}
+{'loss': '0.6871', 'grad_norm': '1.042', 'learning_rate': '2.607e-05', 'epoch': '0.05226', 'num_input_tokens_seen': 876116, 'train_runtime': '448.1', 'train_tokens_per_second': '1955'}
+{'loss': '1.365', 'grad_norm': '1.48', 'learning_rate': '2.613e-05', 'epoch': '0.05238', 'num_input_tokens_seen': 878163, 'train_runtime': '449.2', 'train_tokens_per_second': '1955'}
+{'loss': '1.283', 'grad_norm': '1.152', 'learning_rate': '2.619e-05', 'epoch': '0.0525', 'num_input_tokens_seen': 880210, 'train_runtime': '450.2', 'train_tokens_per_second': '1955'}
+{'loss': '1.19', 'grad_norm': '1.281', 'learning_rate': '2.625e-05', 'epoch': '0.05263', 'num_input_tokens_seen': 882257, 'train_runtime': '451.3', 'train_tokens_per_second': '1955'}
+{'loss': '1.21', 'grad_norm': '1.092', 'learning_rate': '2.631e-05', 'epoch': '0.05275', 'num_input_tokens_seen': 884304, 'train_runtime': '452.3', 'train_tokens_per_second': '1955'}
+{'loss': '1.383', 'grad_norm': '1.652', 'learning_rate': '2.637e-05', 'epoch': '0.05287', 'num_input_tokens_seen': 886351, 'train_runtime': '453.3', 'train_tokens_per_second': '1955'}
+{'loss': '0.8418', 'grad_norm': '0.9371', 'learning_rate': '2.643e-05', 'epoch': '0.05299', 'num_input_tokens_seen': 888398, 'train_runtime': '454.4', 'train_tokens_per_second': '1955'}
+{'loss': '0.5298', 'grad_norm': '0.874', 'learning_rate': '2.65e-05', 'epoch': '0.05311', 'num_input_tokens_seen': 890445, 'train_runtime': '455.4', 'train_tokens_per_second': '1955'}
+{'loss': '1.225', 'grad_norm': '1.331', 'learning_rate': '2.656e-05', 'epoch': '0.05324', 'num_input_tokens_seen': 892492, 'train_runtime': '456.5', 'train_tokens_per_second': '1955'}
+{'loss': '1.263', 'grad_norm': '1.346', 'learning_rate': '2.662e-05', 'epoch': '0.05336', 'num_input_tokens_seen': 894539, 'train_runtime': '457.5', 'train_tokens_per_second': '1955'}
+{'loss': '1.176', 'grad_norm': '1.117', 'learning_rate': '2.668e-05', 'epoch': '0.05348', 'num_input_tokens_seen': 896586, 'train_runtime': '458.6', 'train_tokens_per_second': '1955'}
+{'loss': '1.284', 'grad_norm': '1.278', 'learning_rate': '2.674e-05', 'epoch': '0.0536', 'num_input_tokens_seen': 898633, 'train_runtime': '459.6', 'train_tokens_per_second': '1955'}
+{'loss': '0.9613', 'grad_norm': '0.9784', 'learning_rate': '2.68e-05', 'epoch': '0.05372', 'num_input_tokens_seen': 900680, 'train_runtime': '460.7', 'train_tokens_per_second': '1955'}
+{'loss': '1.384', 'grad_norm': '1.524', 'learning_rate': '2.686e-05', 'epoch': '0.05385', 'num_input_tokens_seen': 902727, 'train_runtime': '461.7', 'train_tokens_per_second': '1955'}
+{'loss': '1.036', 'grad_norm': '0.9521', 'learning_rate': '2.692e-05', 'epoch': '0.05397', 'num_input_tokens_seen': 904774, 'train_runtime': '462.8', 'train_tokens_per_second': '1955'}
+{'loss': '0.6537', 'grad_norm': '0.9239', 'learning_rate': '2.698e-05', 'epoch': '0.05409', 'num_input_tokens_seen': 906821, 'train_runtime': '463.8', 'train_tokens_per_second': '1955'}
+{'loss': '0.7127', 'grad_norm': '0.9552', 'learning_rate': '2.705e-05', 'epoch': '0.05421', 'num_input_tokens_seen': 908868, 'train_runtime': '464.8', 'train_tokens_per_second': '1955'}
+{'loss': '1.48', 'grad_norm': '1.539', 'learning_rate': '2.711e-05', 'epoch': '0.05433', 'num_input_tokens_seen': 910915, 'train_runtime': '465.9', 'train_tokens_per_second': '1955'}
+{'loss': '1.101', 'grad_norm': '1.102', 'learning_rate': '2.717e-05', 'epoch': '0.05446', 'num_input_tokens_seen': 912962, 'train_runtime': '466.9', 'train_tokens_per_second': '1955'}
+{'loss': '0.6806', 'grad_norm': '0.9991', 'learning_rate': '2.723e-05', 'epoch': '0.05458', 'num_input_tokens_seen': 915009, 'train_runtime': '468', 'train_tokens_per_second': '1955'}
+{'loss': '1.108', 'grad_norm': '1.18', 'learning_rate': '2.729e-05', 'epoch': '0.0547', 'num_input_tokens_seen': 917056, 'train_runtime': '469', 'train_tokens_per_second': '1955'}
+{'loss': '1.053', 'grad_norm': '1.299', 'learning_rate': '2.735e-05', 'epoch': '0.05482', 'num_input_tokens_seen': 919103, 'train_runtime': '470.1', 'train_tokens_per_second': '1955'}
+{'loss': '0.8509', 'grad_norm': '0.8801', 'learning_rate': '2.741e-05', 'epoch': '0.05495', 'num_input_tokens_seen': 921150, 'train_runtime': '471.1', 'train_tokens_per_second': '1955'}
+{'loss': '0.8446', 'grad_norm': '1.211', 'learning_rate': '2.747e-05', 'epoch': '0.05507', 'num_input_tokens_seen': 923197, 'train_runtime': '472.1', 'train_tokens_per_second': '1955'}
+{'loss': '0.587', 'grad_norm': '0.836', 'learning_rate': '2.753e-05', 'epoch': '0.05519', 'num_input_tokens_seen': 925244, 'train_runtime': '473.2', 'train_tokens_per_second': '1955'}
+{'loss': '0.7232', 'grad_norm': '1.055', 'learning_rate': '2.759e-05', 'epoch': '0.05531', 'num_input_tokens_seen': 927291, 'train_runtime': '474.2', 'train_tokens_per_second': '1955'}
+{'loss': '1.315', 'grad_norm': '1.442', 'learning_rate': '2.766e-05', 'epoch': '0.05543', 'num_input_tokens_seen': 929338, 'train_runtime': '475.3', 'train_tokens_per_second': '1955'}
+{'loss': '0.9902', 'grad_norm': '1.218', 'learning_rate': '2.772e-05', 'epoch': '0.05556', 'num_input_tokens_seen': 931385, 'train_runtime': '476.3', 'train_tokens_per_second': '1955'}
+{'loss': '1.305', 'grad_norm': '1.559', 'learning_rate': '2.778e-05', 'epoch': '0.05568', 'num_input_tokens_seen': 933432, 'train_runtime': '477.4', 'train_tokens_per_second': '1955'}
+{'loss': '1.206', 'grad_norm': '1.379', 'learning_rate': '2.784e-05', 'epoch': '0.0558', 'num_input_tokens_seen': 935479, 'train_runtime': '478.4', 'train_tokens_per_second': '1955'}
+{'loss': '0.0123', 'grad_norm': '0.08998', 'learning_rate': '2.79e-05', 'epoch': '0.05592', 'num_input_tokens_seen': 937526, 'train_runtime': '479.4', 'train_tokens_per_second': '1955'}
+{'loss': '1.112', 'grad_norm': '1.656', 'learning_rate': '2.796e-05', 'epoch': '0.05604', 'num_input_tokens_seen': 939573, 'train_runtime': '480.5', 'train_tokens_per_second': '1955'}
+{'loss': '1.147', 'grad_norm': '1.41', 'learning_rate': '2.802e-05', 'epoch': '0.05617', 'num_input_tokens_seen': 941620, 'train_runtime': '481.5', 'train_tokens_per_second': '1956'}
+{'loss': '0.688', 'grad_norm': '1.043', 'learning_rate': '2.808e-05', 'epoch': '0.05629', 'num_input_tokens_seen': 943667, 'train_runtime': '482.6', 'train_tokens_per_second': '1956'}
+{'loss': '1.189', 'grad_norm': '1.356', 'learning_rate': '2.814e-05', 'epoch': '0.05641', 'num_input_tokens_seen': 945714, 'train_runtime': '483.6', 'train_tokens_per_second': '1956'}
+{'loss': '0.4871', 'grad_norm': '0.7695', 'learning_rate': '2.821e-05', 'epoch': '0.05653', 'num_input_tokens_seen': 947761, 'train_runtime': '484.7', 'train_tokens_per_second': '1956'}
+{'loss': '0.9934', 'grad_norm': '1.253', 'learning_rate': '2.827e-05', 'epoch': '0.05665', 'num_input_tokens_seen': 949808, 'train_runtime': '485.7', 'train_tokens_per_second': '1956'}
+{'loss': '0.6', 'grad_norm': '0.8431', 'learning_rate': '2.833e-05', 'epoch': '0.05678', 'num_input_tokens_seen': 951855, 'train_runtime': '486.7', 'train_tokens_per_second': '1956'}
+{'loss': '1.166', 'grad_norm': '1.428', 'learning_rate': '2.839e-05', 'epoch': '0.0569', 'num_input_tokens_seen': 953902, 'train_runtime': '487.8', 'train_tokens_per_second': '1956'}
+{'loss': '0.588', 'grad_norm': '0.9063', 'learning_rate': '2.845e-05', 'epoch': '0.05702', 'num_input_tokens_seen': 955949, 'train_runtime': '488.8', 'train_tokens_per_second': '1956'}
+{'loss': '0.8766', 'grad_norm': '0.994', 'learning_rate': '2.851e-05', 'epoch': '0.05714', 'num_input_tokens_seen': 957996, 'train_runtime': '489.9', 'train_tokens_per_second': '1956'}
+{'loss': '1.262', 'grad_norm': '1.422', 'learning_rate': '2.857e-05', 'epoch': '0.05726', 'num_input_tokens_seen': 960043, 'train_runtime': '490.9', 'train_tokens_per_second': '1956'}
+{'loss': '1.148', 'grad_norm': '1.386', 'learning_rate': '2.863e-05', 'epoch': '0.05739', 'num_input_tokens_seen': 962090, 'train_runtime': '491.9', 'train_tokens_per_second': '1956'}
+{'loss': '1.306', 'grad_norm': '1.835', 'learning_rate': '2.869e-05', 'epoch': '0.05751', 'num_input_tokens_seen': 964137, 'train_runtime': '493', 'train_tokens_per_second': '1956'}
+{'loss': '1.432', 'grad_norm': '1.183', 'learning_rate': '2.875e-05', 'epoch': '0.05763', 'num_input_tokens_seen': 966184, 'train_runtime': '494', 'train_tokens_per_second': '1956'}
+{'loss': '0.7086', 'grad_norm': '1.066', 'learning_rate': '2.882e-05', 'epoch': '0.05775', 'num_input_tokens_seen': 968231, 'train_runtime': '495.1', 'train_tokens_per_second': '1956'}
+{'loss': '1.149', 'grad_norm': '1.413', 'learning_rate': '2.888e-05', 'epoch': '0.05788', 'num_input_tokens_seen': 970278, 'train_runtime': '496.1', 'train_tokens_per_second': '1956'}
+{'loss': '0.52', 'grad_norm': '0.939', 'learning_rate': '2.894e-05', 'epoch': '0.058', 'num_input_tokens_seen': 972325, 'train_runtime': '497.2', 'train_tokens_per_second': '1956'}
+{'loss': '0.649', 'grad_norm': '1.013', 'learning_rate': '2.9e-05', 'epoch': '0.05812', 'num_input_tokens_seen': 974372, 'train_runtime': '498.2', 'train_tokens_per_second': '1956'}
+{'loss': '1.451', 'grad_norm': '1.475', 'learning_rate': '2.906e-05', 'epoch': '0.05824', 'num_input_tokens_seen': 976419, 'train_runtime': '499.3', 'train_tokens_per_second': '1956'}
+{'loss': '1.186', 'grad_norm': '1.553', 'learning_rate': '2.912e-05', 'epoch': '0.05836', 'num_input_tokens_seen': 978466, 'train_runtime': '500.3', 'train_tokens_per_second': '1956'}
+{'loss': '0.6405', 'grad_norm': '1.071', 'learning_rate': '2.918e-05', 'epoch': '0.05849', 'num_input_tokens_seen': 980513, 'train_runtime': '501.3', 'train_tokens_per_second': '1956'}
+{'loss': '1.085', 'grad_norm': '1.42', 'learning_rate': '2.924e-05', 'epoch': '0.05861', 'num_input_tokens_seen': 982560, 'train_runtime': '502.4', 'train_tokens_per_second': '1956'}
+{'loss': '0.4971', 'grad_norm': '0.7741', 'learning_rate': '2.93e-05', 'epoch': '0.05873', 'num_input_tokens_seen': 984607, 'train_runtime': '503.4', 'train_tokens_per_second': '1956'}
+{'loss': '0.8035', 'grad_norm': '1.318', 'learning_rate': '2.937e-05', 'epoch': '0.05885', 'num_input_tokens_seen': 986654, 'train_runtime': '504.5', 'train_tokens_per_second': '1956'}
+{'loss': '1.381', 'grad_norm': '1.516', 'learning_rate': '2.943e-05', 'epoch': '0.05897', 'num_input_tokens_seen': 988701, 'train_runtime': '505.5', 'train_tokens_per_second': '1956'}
+{'loss': '1.114', 'grad_norm': '1.394', 'learning_rate': '2.949e-05', 'epoch': '0.0591', 'num_input_tokens_seen': 990748, 'train_runtime': '506.6', 'train_tokens_per_second': '1956'}
+{'loss': '1.29', 'grad_norm': '4.55', 'learning_rate': '2.955e-05', 'epoch': '0.05922', 'num_input_tokens_seen': 992795, 'train_runtime': '507.6', 'train_tokens_per_second': '1956'}
+{'loss': '0.52', 'grad_norm': '0.8739', 'learning_rate': '2.961e-05', 'epoch': '0.05934', 'num_input_tokens_seen': 994842, 'train_runtime': '508.6', 'train_tokens_per_second': '1956'}
+{'loss': '0.6595', 'grad_norm': '0.9419', 'learning_rate': '2.967e-05', 'epoch': '0.05946', 'num_input_tokens_seen': 996889, 'train_runtime': '509.7', 'train_tokens_per_second': '1956'}
+{'loss': '1.374', 'grad_norm': '1.408', 'learning_rate': '2.973e-05', 'epoch': '0.05958', 'num_input_tokens_seen': 998936, 'train_runtime': '510.7', 'train_tokens_per_second': '1956'}
+{'loss': '1.316', 'grad_norm': '1.445', 'learning_rate': '2.979e-05', 'epoch': '0.05971', 'num_input_tokens_seen': 1000983, 'train_runtime': '511.8', 'train_tokens_per_second': '1956'}
+{'loss': '0.931', 'grad_norm': '1.468', 'learning_rate': '2.985e-05', 'epoch': '0.05983', 'num_input_tokens_seen': 1003030, 'train_runtime': '512.8', 'train_tokens_per_second': '1956'}
+{'loss': '1.217', 'grad_norm': '1.58', 'learning_rate': '2.991e-05', 'epoch': '0.05995', 'num_input_tokens_seen': 1005077, 'train_runtime': '513.9', 'train_tokens_per_second': '1956'}
+{'loss': '0.6076', 'grad_norm': '0.9179', 'learning_rate': '2.998e-05', 'epoch': '0.06007', 'num_input_tokens_seen': 1007124, 'train_runtime': '514.9', 'train_tokens_per_second': '1956'}
+{'loss': '1.361', 'grad_norm': '1.476', 'learning_rate': '3.004e-05', 'epoch': '0.0602', 'num_input_tokens_seen': 1009171, 'train_runtime': '515.9', 'train_tokens_per_second': '1956'}
+{'loss': '1.606', 'grad_norm': '2.029', 'learning_rate': '3.01e-05', 'epoch': '0.06032', 'num_input_tokens_seen': 1011218, 'train_runtime': '517', 'train_tokens_per_second': '1956'}
+{'loss': '1.173', 'grad_norm': '1.377', 'learning_rate': '3.016e-05', 'epoch': '0.06044', 'num_input_tokens_seen': 1013265, 'train_runtime': '518', 'train_tokens_per_second': '1956'}
+{'loss': '1.375', 'grad_norm': '1.666', 'learning_rate': '3.022e-05', 'epoch': '0.06056', 'num_input_tokens_seen': 1015312, 'train_runtime': '519.1', 'train_tokens_per_second': '1956'}
+{'loss': '0.9488', 'grad_norm': '1.412', 'learning_rate': '3.028e-05', 'epoch': '0.06068', 'num_input_tokens_seen': 1017359, 'train_runtime': '520.1', 'train_tokens_per_second': '1956'}
+{'loss': '1.203', 'grad_norm': '1.609', 'learning_rate': '3.034e-05', 'epoch': '0.06081', 'num_input_tokens_seen': 1019406, 'train_runtime': '521.2', 'train_tokens_per_second': '1956'}
+{'loss': '1.256', 'grad_norm': '1.467', 'learning_rate': '3.04e-05', 'epoch': '0.06093', 'num_input_tokens_seen': 1021453, 'train_runtime': '522.2', 'train_tokens_per_second': '1956'}
+{'loss': '0.8057', 'grad_norm': '1.074', 'learning_rate': '3.046e-05', 'epoch': '0.06105', 'num_input_tokens_seen': 1023500, 'train_runtime': '523.2', 'train_tokens_per_second': '1956'}
+{'loss': '0.6386', 'grad_norm': '1.053', 'learning_rate': '3.053e-05', 'epoch': '0.06117', 'num_input_tokens_seen': 1025547, 'train_runtime': '524.3', 'train_tokens_per_second': '1956'}
+{'loss': '1.182', 'grad_norm': '1.721', 'learning_rate': '3.059e-05', 'epoch': '0.06129', 'num_input_tokens_seen': 1027594, 'train_runtime': '525.3', 'train_tokens_per_second': '1956'}
+{'loss': '1.169', 'grad_norm': '1.475', 'learning_rate': '3.065e-05', 'epoch': '0.06142', 'num_input_tokens_seen': 1029641, 'train_runtime': '526.4', 'train_tokens_per_second': '1956'}
+{'loss': '1.049', 'grad_norm': '1.367', 'learning_rate': '3.071e-05', 'epoch': '0.06154', 'num_input_tokens_seen': 1031688, 'train_runtime': '527.4', 'train_tokens_per_second': '1956'}
+{'loss': '1.287', 'grad_norm': '2.022', 'learning_rate': '3.077e-05', 'epoch': '0.06166', 'num_input_tokens_seen': 1033735, 'train_runtime': '528.4', 'train_tokens_per_second': '1956'}
+{'loss': '0.7925', 'grad_norm': '0.938', 'learning_rate': '3.083e-05', 'epoch': '0.06178', 'num_input_tokens_seen': 1035782, 'train_runtime': '529.5', 'train_tokens_per_second': '1956'}
+{'loss': '1.264', 'grad_norm': '1.644', 'learning_rate': '3.089e-05', 'epoch': '0.0619', 'num_input_tokens_seen': 1037829, 'train_runtime': '530.5', 'train_tokens_per_second': '1956'}
+{'loss': '1.011', 'grad_norm': '1.444', 'learning_rate': '3.095e-05', 'epoch': '0.06203', 'num_input_tokens_seen': 1039876, 'train_runtime': '531.6', 'train_tokens_per_second': '1956'}
+{'loss': '0.6263', 'grad_norm': '1.05', 'learning_rate': '3.101e-05', 'epoch': '0.06215', 'num_input_tokens_seen': 1041923, 'train_runtime': '532.6', 'train_tokens_per_second': '1956'}
+{'loss': '0.9867', 'grad_norm': '1.71', 'learning_rate': '3.107e-05', 'epoch': '0.06227', 'num_input_tokens_seen': 1043970, 'train_runtime': '533.7', 'train_tokens_per_second': '1956'}
+{'loss': '1.68', 'grad_norm': '1.966', 'learning_rate': '3.114e-05', 'epoch': '0.06239', 'num_input_tokens_seen': 1046017, 'train_runtime': '534.7', 'train_tokens_per_second': '1956'}
+{'loss': '0.9488', 'grad_norm': '1.145', 'learning_rate': '3.12e-05', 'epoch': '0.06252', 'num_input_tokens_seen': 1048064, 'train_runtime': '535.8', 'train_tokens_per_second': '1956'}
+{'loss': '0.6286', 'grad_norm': '1.166', 'learning_rate': '3.126e-05', 'epoch': '0.06264', 'num_input_tokens_seen': 1050111, 'train_runtime': '536.8', 'train_tokens_per_second': '1956'}
+{'loss': '1.126', 'grad_norm': '1.507', 'learning_rate': '3.132e-05', 'epoch': '0.06276', 'num_input_tokens_seen': 1052158, 'train_runtime': '537.8', 'train_tokens_per_second': '1956'}
+{'loss': '1.03', 'grad_norm': '1.194', 'learning_rate': '3.138e-05', 'epoch': '0.06288', 'num_input_tokens_seen': 1054205, 'train_runtime': '538.9', 'train_tokens_per_second': '1956'}
+{'loss': '1.182', 'grad_norm': '1.623', 'learning_rate': '3.144e-05', 'epoch': '0.063', 'num_input_tokens_seen': 1056252, 'train_runtime': '539.9', 'train_tokens_per_second': '1956'}
+{'loss': '0.6433', 'grad_norm': '1.236', 'learning_rate': '3.15e-05', 'epoch': '0.06313', 'num_input_tokens_seen': 1058299, 'train_runtime': '541', 'train_tokens_per_second': '1956'}
+{'loss': '1.125', 'grad_norm': '1.552', 'learning_rate': '3.156e-05', 'epoch': '0.06325', 'num_input_tokens_seen': 1060346, 'train_runtime': '542', 'train_tokens_per_second': '1956'}
+{'loss': '1.383', 'grad_norm': '1.723', 'learning_rate': '3.162e-05', 'epoch': '0.06337', 'num_input_tokens_seen': 1062393, 'train_runtime': '543', 'train_tokens_per_second': '1956'}
+{'loss': '0.9743', 'grad_norm': '1.448', 'learning_rate': '3.168e-05', 'epoch': '0.06349', 'num_input_tokens_seen': 1064440, 'train_runtime': '544.1', 'train_tokens_per_second': '1956'}
+{'loss': '0.9158', 'grad_norm': '1.134', 'learning_rate': '3.175e-05', 'epoch': '0.06361', 'num_input_tokens_seen': 1066487, 'train_runtime': '545.1', 'train_tokens_per_second': '1956'}
+{'loss': '1.231', 'grad_norm': '1.534', 'learning_rate': '3.181e-05', 'epoch': '0.06374', 'num_input_tokens_seen': 1068534, 'train_runtime': '546.2', 'train_tokens_per_second': '1956'}
+{'loss': '1.271', 'grad_norm': '1.669', 'learning_rate': '3.187e-05', 'epoch': '0.06386', 'num_input_tokens_seen': 1070581, 'train_runtime': '547.2', 'train_tokens_per_second': '1956'}
+{'loss': '1.041', 'grad_norm': '1.514', 'learning_rate': '3.193e-05', 'epoch': '0.06398', 'num_input_tokens_seen': 1072628, 'train_runtime': '548.3', 'train_tokens_per_second': '1956'}
+{'loss': '0.5798', 'grad_norm': '1.165', 'learning_rate': '3.199e-05', 'epoch': '0.0641', 'num_input_tokens_seen': 1074675, 'train_runtime': '549.3', 'train_tokens_per_second': '1956'}
+{'loss': '1.131', 'grad_norm': '1.551', 'learning_rate': '3.205e-05', 'epoch': '0.06422', 'num_input_tokens_seen': 1076722, 'train_runtime': '550.3', 'train_tokens_per_second': '1956'}
+{'loss': '1.048', 'grad_norm': '1.223', 'learning_rate': '3.211e-05', 'epoch': '0.06435', 'num_input_tokens_seen': 1078769, 'train_runtime': '551.4', 'train_tokens_per_second': '1956'}
+{'loss': '0.8503', 'grad_norm': '1.326', 'learning_rate': '3.217e-05', 'epoch': '0.06447', 'num_input_tokens_seen': 1080816, 'train_runtime': '552.4', 'train_tokens_per_second': '1956'}
+{'loss': '1.131', 'grad_norm': '1.568', 'learning_rate': '3.223e-05', 'epoch': '0.06459', 'num_input_tokens_seen': 1082863, 'train_runtime': '553.5', 'train_tokens_per_second': '1956'}
+{'loss': '1.454', 'grad_norm': '1.989', 'learning_rate': '3.23e-05', 'epoch': '0.06471', 'num_input_tokens_seen': 1084910, 'train_runtime': '554.5', 'train_tokens_per_second': '1956'}
+{'loss': '0.6111', 'grad_norm': '1.125', 'learning_rate': '3.236e-05', 'epoch': '0.06484', 'num_input_tokens_seen': 1086957, 'train_runtime': '555.6', 'train_tokens_per_second': '1956'}
+{'loss': '0.8443', 'grad_norm': '1.18', 'learning_rate': '3.242e-05', 'epoch': '0.06496', 'num_input_tokens_seen': 1089004, 'train_runtime': '556.6', 'train_tokens_per_second': '1956'}
+{'loss': '1.127', 'grad_norm': '1.569', 'learning_rate': '3.248e-05', 'epoch': '0.06508', 'num_input_tokens_seen': 1091051, 'train_runtime': '557.7', 'train_tokens_per_second': '1956'}
+{'loss': '1.16', 'grad_norm': '1.6', 'learning_rate': '3.254e-05', 'epoch': '0.0652', 'num_input_tokens_seen': 1093098, 'train_runtime': '558.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.189', 'grad_norm': '1.776', 'learning_rate': '3.26e-05', 'epoch': '0.06532', 'num_input_tokens_seen': 1095145, 'train_runtime': '559.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.269', 'grad_norm': '1.873', 'learning_rate': '3.266e-05', 'epoch': '0.06545', 'num_input_tokens_seen': 1097192, 'train_runtime': '560.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.209', 'grad_norm': '1.636', 'learning_rate': '3.272e-05', 'epoch': '0.06557', 'num_input_tokens_seen': 1099239, 'train_runtime': '561.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.6367', 'grad_norm': '0.9915', 'learning_rate': '3.278e-05', 'epoch': '0.06569', 'num_input_tokens_seen': 1101286, 'train_runtime': '562.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.167', 'grad_norm': '1.482', 'learning_rate': '3.284e-05', 'epoch': '0.06581', 'num_input_tokens_seen': 1103333, 'train_runtime': '563.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.131', 'grad_norm': '1.616', 'learning_rate': '3.291e-05', 'epoch': '0.06593', 'num_input_tokens_seen': 1105380, 'train_runtime': '564.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.529', 'grad_norm': '1.02', 'learning_rate': '3.297e-05', 'epoch': '0.06606', 'num_input_tokens_seen': 1107427, 'train_runtime': '566', 'train_tokens_per_second': '1957'}
+{'loss': '0.6785', 'grad_norm': '1.182', 'learning_rate': '3.303e-05', 'epoch': '0.06618', 'num_input_tokens_seen': 1109474, 'train_runtime': '567', 'train_tokens_per_second': '1957'}
+{'loss': '1.03', 'grad_norm': '1.896', 'learning_rate': '3.309e-05', 'epoch': '0.0663', 'num_input_tokens_seen': 1111521, 'train_runtime': '568.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.9816', 'grad_norm': '1.441', 'learning_rate': '3.315e-05', 'epoch': '0.06642', 'num_input_tokens_seen': 1113568, 'train_runtime': '569.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.9641', 'grad_norm': '1.868', 'learning_rate': '3.321e-05', 'epoch': '0.06654', 'num_input_tokens_seen': 1115615, 'train_runtime': '570.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.222', 'grad_norm': '1.788', 'learning_rate': '3.327e-05', 'epoch': '0.06667', 'num_input_tokens_seen': 1117662, 'train_runtime': '571.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.073', 'grad_norm': '1.482', 'learning_rate': '3.333e-05', 'epoch': '0.06679', 'num_input_tokens_seen': 1119709, 'train_runtime': '572.3', 'train_tokens_per_second': '1957'}
+{'loss': '1.241', 'grad_norm': '1.697', 'learning_rate': '3.339e-05', 'epoch': '0.06691', 'num_input_tokens_seen': 1121756, 'train_runtime': '573.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.9652', 'grad_norm': '1.58', 'learning_rate': '3.346e-05', 'epoch': '0.06703', 'num_input_tokens_seen': 1123803, 'train_runtime': '574.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.028', 'grad_norm': '1.696', 'learning_rate': '3.352e-05', 'epoch': '0.06716', 'num_input_tokens_seen': 1125850, 'train_runtime': '575.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.9986', 'grad_norm': '1.511', 'learning_rate': '3.358e-05', 'epoch': '0.06728', 'num_input_tokens_seen': 1127897, 'train_runtime': '576.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.319', 'grad_norm': '1.518', 'learning_rate': '3.364e-05', 'epoch': '0.0674', 'num_input_tokens_seen': 1129944, 'train_runtime': '577.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.897', 'grad_norm': '1.46', 'learning_rate': '3.37e-05', 'epoch': '0.06752', 'num_input_tokens_seen': 1131991, 'train_runtime': '578.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.9635', 'grad_norm': '1.779', 'learning_rate': '3.376e-05', 'epoch': '0.06764', 'num_input_tokens_seen': 1134038, 'train_runtime': '579.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.064', 'grad_norm': '1.582', 'learning_rate': '3.382e-05', 'epoch': '0.06777', 'num_input_tokens_seen': 1136085, 'train_runtime': '580.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.218', 'grad_norm': '1.636', 'learning_rate': '3.388e-05', 'epoch': '0.06789', 'num_input_tokens_seen': 1138132, 'train_runtime': '581.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.6134', 'grad_norm': '1.145', 'learning_rate': '3.394e-05', 'epoch': '0.06801', 'num_input_tokens_seen': 1140179, 'train_runtime': '582.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.497', 'grad_norm': '1.673', 'learning_rate': '3.4e-05', 'epoch': '0.06813', 'num_input_tokens_seen': 1142226, 'train_runtime': '583.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.7189', 'grad_norm': '1.162', 'learning_rate': '3.407e-05', 'epoch': '0.06825', 'num_input_tokens_seen': 1144273, 'train_runtime': '584.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.5028', 'grad_norm': '1.055', 'learning_rate': '3.413e-05', 'epoch': '0.06838', 'num_input_tokens_seen': 1146320, 'train_runtime': '585.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.229', 'grad_norm': '1.98', 'learning_rate': '3.419e-05', 'epoch': '0.0685', 'num_input_tokens_seen': 1148367, 'train_runtime': '586.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.009', 'grad_norm': '1.25', 'learning_rate': '3.425e-05', 'epoch': '0.06862', 'num_input_tokens_seen': 1150414, 'train_runtime': '587.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.9904', 'grad_norm': '1.605', 'learning_rate': '3.431e-05', 'epoch': '0.06874', 'num_input_tokens_seen': 1152461, 'train_runtime': '589', 'train_tokens_per_second': '1957'}
+{'loss': '1.017', 'grad_norm': '1.482', 'learning_rate': '3.437e-05', 'epoch': '0.06886', 'num_input_tokens_seen': 1154508, 'train_runtime': '590', 'train_tokens_per_second': '1957'}
+{'loss': '0.9522', 'grad_norm': '1.409', 'learning_rate': '3.443e-05', 'epoch': '0.06899', 'num_input_tokens_seen': 1156555, 'train_runtime': '591.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.5172', 'grad_norm': '1.015', 'learning_rate': '3.449e-05', 'epoch': '0.06911', 'num_input_tokens_seen': 1158602, 'train_runtime': '592.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.6328', 'grad_norm': '1.063', 'learning_rate': '3.455e-05', 'epoch': '0.06923', 'num_input_tokens_seen': 1160649, 'train_runtime': '593.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.169', 'grad_norm': '1.666', 'learning_rate': '3.462e-05', 'epoch': '0.06935', 'num_input_tokens_seen': 1162696, 'train_runtime': '594.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.508', 'grad_norm': '1.037', 'learning_rate': '3.468e-05', 'epoch': '0.06947', 'num_input_tokens_seen': 1164743, 'train_runtime': '595.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.5144', 'grad_norm': '1.194', 'learning_rate': '3.474e-05', 'epoch': '0.0696', 'num_input_tokens_seen': 1166790, 'train_runtime': '596.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.631', 'grad_norm': '1.147', 'learning_rate': '3.48e-05', 'epoch': '0.06972', 'num_input_tokens_seen': 1168837, 'train_runtime': '597.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.8113', 'grad_norm': '1.345', 'learning_rate': '3.486e-05', 'epoch': '0.06984', 'num_input_tokens_seen': 1170884, 'train_runtime': '598.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.8212', 'grad_norm': '1.201', 'learning_rate': '3.492e-05', 'epoch': '0.06996', 'num_input_tokens_seen': 1172931, 'train_runtime': '599.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.157', 'grad_norm': '1.747', 'learning_rate': '3.498e-05', 'epoch': '0.07009', 'num_input_tokens_seen': 1174978, 'train_runtime': '600.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.5543', 'grad_norm': '1.086', 'learning_rate': '3.504e-05', 'epoch': '0.07021', 'num_input_tokens_seen': 1177025, 'train_runtime': '601.5', 'train_tokens_per_second': '1957'}
+{'loss': '1.31', 'grad_norm': '2.037', 'learning_rate': '3.51e-05', 'epoch': '0.07033', 'num_input_tokens_seen': 1179072, 'train_runtime': '602.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.9028', 'grad_norm': '1.655', 'learning_rate': '3.516e-05', 'epoch': '0.07045', 'num_input_tokens_seen': 1181119, 'train_runtime': '603.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.7332', 'grad_norm': '1.451', 'learning_rate': '3.523e-05', 'epoch': '0.07057', 'num_input_tokens_seen': 1183166, 'train_runtime': '604.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.6048', 'grad_norm': '1.26', 'learning_rate': '3.529e-05', 'epoch': '0.0707', 'num_input_tokens_seen': 1185213, 'train_runtime': '605.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.008', 'grad_norm': '1.865', 'learning_rate': '3.535e-05', 'epoch': '0.07082', 'num_input_tokens_seen': 1187260, 'train_runtime': '606.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.108', 'grad_norm': '1.805', 'learning_rate': '3.541e-05', 'epoch': '0.07094', 'num_input_tokens_seen': 1189307, 'train_runtime': '607.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.8695', 'grad_norm': '1.579', 'learning_rate': '3.547e-05', 'epoch': '0.07106', 'num_input_tokens_seen': 1191354, 'train_runtime': '608.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.4635', 'grad_norm': '1.072', 'learning_rate': '3.553e-05', 'epoch': '0.07118', 'num_input_tokens_seen': 1193401, 'train_runtime': '609.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.182', 'grad_norm': '1.821', 'learning_rate': '3.559e-05', 'epoch': '0.07131', 'num_input_tokens_seen': 1195448, 'train_runtime': '610.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.9705', 'grad_norm': '1.627', 'learning_rate': '3.565e-05', 'epoch': '0.07143', 'num_input_tokens_seen': 1197495, 'train_runtime': '612', 'train_tokens_per_second': '1957'}
+{'loss': '1.371', 'grad_norm': '1.734', 'learning_rate': '3.571e-05', 'epoch': '0.07155', 'num_input_tokens_seen': 1199542, 'train_runtime': '613', 'train_tokens_per_second': '1957'}
+{'loss': '0.5607', 'grad_norm': '1.183', 'learning_rate': '3.578e-05', 'epoch': '0.07167', 'num_input_tokens_seen': 1201589, 'train_runtime': '614.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.276', 'grad_norm': '1.557', 'learning_rate': '3.584e-05', 'epoch': '0.07179', 'num_input_tokens_seen': 1203636, 'train_runtime': '615.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.7013', 'grad_norm': '1.357', 'learning_rate': '3.59e-05', 'epoch': '0.07192', 'num_input_tokens_seen': 1205683, 'train_runtime': '616.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.12', 'grad_norm': '1.484', 'learning_rate': '3.596e-05', 'epoch': '0.07204', 'num_input_tokens_seen': 1207730, 'train_runtime': '617.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.428', 'grad_norm': '1.914', 'learning_rate': '3.602e-05', 'epoch': '0.07216', 'num_input_tokens_seen': 1209777, 'train_runtime': '618.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.115', 'grad_norm': '1.661', 'learning_rate': '3.608e-05', 'epoch': '0.07228', 'num_input_tokens_seen': 1211824, 'train_runtime': '619.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.5601', 'grad_norm': '1.337', 'learning_rate': '3.614e-05', 'epoch': '0.07241', 'num_input_tokens_seen': 1213871, 'train_runtime': '620.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.9062', 'grad_norm': '1.386', 'learning_rate': '3.62e-05', 'epoch': '0.07253', 'num_input_tokens_seen': 1215918, 'train_runtime': '621.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.8536', 'grad_norm': '1.709', 'learning_rate': '3.626e-05', 'epoch': '0.07265', 'num_input_tokens_seen': 1217965, 'train_runtime': '622.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.088', 'grad_norm': '1.932', 'learning_rate': '3.632e-05', 'epoch': '0.07277', 'num_input_tokens_seen': 1220012, 'train_runtime': '623.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.9508', 'grad_norm': '1.675', 'learning_rate': '3.639e-05', 'epoch': '0.07289', 'num_input_tokens_seen': 1222059, 'train_runtime': '624.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.5207', 'grad_norm': '1.481', 'learning_rate': '3.645e-05', 'epoch': '0.07302', 'num_input_tokens_seen': 1224106, 'train_runtime': '625.5', 'train_tokens_per_second': '1957'}
+{'loss': '1.069', 'grad_norm': '1.778', 'learning_rate': '3.651e-05', 'epoch': '0.07314', 'num_input_tokens_seen': 1226153, 'train_runtime': '626.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.8132', 'grad_norm': '1.633', 'learning_rate': '3.657e-05', 'epoch': '0.07326', 'num_input_tokens_seen': 1228200, 'train_runtime': '627.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.9037', 'grad_norm': '1.671', 'learning_rate': '3.663e-05', 'epoch': '0.07338', 'num_input_tokens_seen': 1230247, 'train_runtime': '628.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.9926', 'grad_norm': '1.908', 'learning_rate': '3.669e-05', 'epoch': '0.0735', 'num_input_tokens_seen': 1232294, 'train_runtime': '629.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.225', 'grad_norm': '1.868', 'learning_rate': '3.675e-05', 'epoch': '0.07363', 'num_input_tokens_seen': 1234341, 'train_runtime': '630.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.355', 'grad_norm': '2.077', 'learning_rate': '3.681e-05', 'epoch': '0.07375', 'num_input_tokens_seen': 1236388, 'train_runtime': '631.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.6866', 'grad_norm': '1.425', 'learning_rate': '3.687e-05', 'epoch': '0.07387', 'num_input_tokens_seen': 1238435, 'train_runtime': '632.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.725', 'grad_norm': '2.401', 'learning_rate': '3.694e-05', 'epoch': '0.07399', 'num_input_tokens_seen': 1240482, 'train_runtime': '633.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.7308', 'grad_norm': '1.593', 'learning_rate': '3.7e-05', 'epoch': '0.07411', 'num_input_tokens_seen': 1242529, 'train_runtime': '634.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.121', 'grad_norm': '1.832', 'learning_rate': '3.706e-05', 'epoch': '0.07424', 'num_input_tokens_seen': 1244576, 'train_runtime': '636', 'train_tokens_per_second': '1957'}
+{'loss': '0.5074', 'grad_norm': '1.329', 'learning_rate': '3.712e-05', 'epoch': '0.07436', 'num_input_tokens_seen': 1246623, 'train_runtime': '637', 'train_tokens_per_second': '1957'}
+{'loss': '0.6383', 'grad_norm': '2.008', 'learning_rate': '3.718e-05', 'epoch': '0.07448', 'num_input_tokens_seen': 1248670, 'train_runtime': '638.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.6861', 'grad_norm': '1.366', 'learning_rate': '3.724e-05', 'epoch': '0.0746', 'num_input_tokens_seen': 1250717, 'train_runtime': '639.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.8565', 'grad_norm': '1.605', 'learning_rate': '3.73e-05', 'epoch': '0.07473', 'num_input_tokens_seen': 1252764, 'train_runtime': '640.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.6336', 'grad_norm': '1.278', 'learning_rate': '3.736e-05', 'epoch': '0.07485', 'num_input_tokens_seen': 1254811, 'train_runtime': '641.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.5593', 'grad_norm': '1.345', 'learning_rate': '3.742e-05', 'epoch': '0.07497', 'num_input_tokens_seen': 1256858, 'train_runtime': '642.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.089', 'grad_norm': '1.422', 'learning_rate': '3.748e-05', 'epoch': '0.07509', 'num_input_tokens_seen': 1258905, 'train_runtime': '643.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.574', 'grad_norm': '1.474', 'learning_rate': '3.755e-05', 'epoch': '0.07521', 'num_input_tokens_seen': 1260952, 'train_runtime': '644.3', 'train_tokens_per_second': '1957'}
+{'loss': '1.341', 'grad_norm': '1.802', 'learning_rate': '3.761e-05', 'epoch': '0.07534', 'num_input_tokens_seen': 1262999, 'train_runtime': '645.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.311', 'grad_norm': '2.155', 'learning_rate': '3.767e-05', 'epoch': '0.07546', 'num_input_tokens_seen': 1265046, 'train_runtime': '646.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.257', 'grad_norm': '1.79', 'learning_rate': '3.773e-05', 'epoch': '0.07558', 'num_input_tokens_seen': 1267093, 'train_runtime': '647.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.254', 'grad_norm': '2.41', 'learning_rate': '3.779e-05', 'epoch': '0.0757', 'num_input_tokens_seen': 1269140, 'train_runtime': '648.5', 'train_tokens_per_second': '1957'}
+{'loss': '1.255', 'grad_norm': '1.597', 'learning_rate': '3.785e-05', 'epoch': '0.07582', 'num_input_tokens_seen': 1271187, 'train_runtime': '649.5', 'train_tokens_per_second': '1957'}
+{'loss': '1.152', 'grad_norm': '1.825', 'learning_rate': '3.791e-05', 'epoch': '0.07595', 'num_input_tokens_seen': 1273234, 'train_runtime': '650.6', 'train_tokens_per_second': '1957'}
+{'loss': '7.132', 'grad_norm': '158.8', 'learning_rate': '3.797e-05', 'epoch': '0.07607', 'num_input_tokens_seen': 1275281, 'train_runtime': '651.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.468', 'grad_norm': '1.003', 'learning_rate': '3.803e-05', 'epoch': '0.07619', 'num_input_tokens_seen': 1277328, 'train_runtime': '652.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.215', 'grad_norm': '1.623', 'learning_rate': '3.81e-05', 'epoch': '0.07631', 'num_input_tokens_seen': 1279375, 'train_runtime': '653.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.344', 'grad_norm': '1.723', 'learning_rate': '3.816e-05', 'epoch': '0.07643', 'num_input_tokens_seen': 1281422, 'train_runtime': '654.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.6123', 'grad_norm': '1.186', 'learning_rate': '3.822e-05', 'epoch': '0.07656', 'num_input_tokens_seen': 1283469, 'train_runtime': '655.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.4808', 'grad_norm': '1.134', 'learning_rate': '3.828e-05', 'epoch': '0.07668', 'num_input_tokens_seen': 1285516, 'train_runtime': '656.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.096', 'grad_norm': '1.858', 'learning_rate': '3.834e-05', 'epoch': '0.0768', 'num_input_tokens_seen': 1287563, 'train_runtime': '657.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.9738', 'grad_norm': '1.28', 'learning_rate': '3.84e-05', 'epoch': '0.07692', 'num_input_tokens_seen': 1289610, 'train_runtime': '658.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.182', 'grad_norm': '1.775', 'learning_rate': '3.846e-05', 'epoch': '0.07705', 'num_input_tokens_seen': 1291657, 'train_runtime': '659.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.4246', 'grad_norm': '1.099', 'learning_rate': '3.852e-05', 'epoch': '0.07717', 'num_input_tokens_seen': 1293704, 'train_runtime': '661.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.8505', 'grad_norm': '1.432', 'learning_rate': '3.858e-05', 'epoch': '0.07729', 'num_input_tokens_seen': 1295751, 'train_runtime': '662.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.461', 'grad_norm': '2.153', 'learning_rate': '3.864e-05', 'epoch': '0.07741', 'num_input_tokens_seen': 1297798, 'train_runtime': '663.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.082', 'grad_norm': '1.558', 'learning_rate': '3.871e-05', 'epoch': '0.07753', 'num_input_tokens_seen': 1299845, 'train_runtime': '664.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.4175', 'grad_norm': '1.096', 'learning_rate': '3.877e-05', 'epoch': '0.07766', 'num_input_tokens_seen': 1301892, 'train_runtime': '665.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.217', 'grad_norm': '1.992', 'learning_rate': '3.883e-05', 'epoch': '0.07778', 'num_input_tokens_seen': 1303939, 'train_runtime': '666.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.9607', 'grad_norm': '1.953', 'learning_rate': '3.889e-05', 'epoch': '0.0779', 'num_input_tokens_seen': 1305986, 'train_runtime': '667.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.5535', 'grad_norm': '1.4', 'learning_rate': '3.895e-05', 'epoch': '0.07802', 'num_input_tokens_seen': 1308033, 'train_runtime': '668.3', 'train_tokens_per_second': '1957'}
+{'loss': '1.023', 'grad_norm': '1.771', 'learning_rate': '3.901e-05', 'epoch': '0.07814', 'num_input_tokens_seen': 1310080, 'train_runtime': '669.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.5419', 'grad_norm': '1.291', 'learning_rate': '3.907e-05', 'epoch': '0.07827', 'num_input_tokens_seen': 1312127, 'train_runtime': '670.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.5003', 'grad_norm': '1.186', 'learning_rate': '3.913e-05', 'epoch': '0.07839', 'num_input_tokens_seen': 1314174, 'train_runtime': '671.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.6199', 'grad_norm': '1.533', 'learning_rate': '3.919e-05', 'epoch': '0.07851', 'num_input_tokens_seen': 1316221, 'train_runtime': '672.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.9357', 'grad_norm': '1.99', 'learning_rate': '3.926e-05', 'epoch': '0.07863', 'num_input_tokens_seen': 1318268, 'train_runtime': '673.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.326', 'grad_norm': '1.901', 'learning_rate': '3.932e-05', 'epoch': '0.07875', 'num_input_tokens_seen': 1320315, 'train_runtime': '674.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.8821', 'grad_norm': '2.087', 'learning_rate': '3.938e-05', 'epoch': '0.07888', 'num_input_tokens_seen': 1322362, 'train_runtime': '675.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.6495', 'grad_norm': '1.361', 'learning_rate': '3.944e-05', 'epoch': '0.079', 'num_input_tokens_seen': 1324409, 'train_runtime': '676.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.99', 'grad_norm': '2.091', 'learning_rate': '3.95e-05', 'epoch': '0.07912', 'num_input_tokens_seen': 1326456, 'train_runtime': '677.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.5321', 'grad_norm': '1.494', 'learning_rate': '3.956e-05', 'epoch': '0.07924', 'num_input_tokens_seen': 1328503, 'train_runtime': '678.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.6226', 'grad_norm': '1.35', 'learning_rate': '3.962e-05', 'epoch': '0.07937', 'num_input_tokens_seen': 1330550, 'train_runtime': '679.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.7117', 'grad_norm': '1.56', 'learning_rate': '3.968e-05', 'epoch': '0.07949', 'num_input_tokens_seen': 1332597, 'train_runtime': '680.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.412', 'grad_norm': '2.585', 'learning_rate': '3.974e-05', 'epoch': '0.07961', 'num_input_tokens_seen': 1334644, 'train_runtime': '681.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.054', 'grad_norm': '1.913', 'learning_rate': '3.98e-05', 'epoch': '0.07973', 'num_input_tokens_seen': 1336691, 'train_runtime': '683', 'train_tokens_per_second': '1957'}
+{'loss': '0.889', 'grad_norm': '1.832', 'learning_rate': '3.987e-05', 'epoch': '0.07985', 'num_input_tokens_seen': 1338738, 'train_runtime': '684', 'train_tokens_per_second': '1957'}
+{'loss': '1.028', 'grad_norm': '1.964', 'learning_rate': '3.993e-05', 'epoch': '0.07998', 'num_input_tokens_seen': 1340785, 'train_runtime': '685.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.206', 'grad_norm': '2.302', 'learning_rate': '3.999e-05', 'epoch': '0.0801', 'num_input_tokens_seen': 1342832, 'train_runtime': '686.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.125', 'grad_norm': '2.127', 'learning_rate': '4.005e-05', 'epoch': '0.08022', 'num_input_tokens_seen': 1344879, 'train_runtime': '687.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.8502', 'grad_norm': '1.48', 'learning_rate': '4.011e-05', 'epoch': '0.08034', 'num_input_tokens_seen': 1346926, 'train_runtime': '688.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.188', 'grad_norm': '2.005', 'learning_rate': '4.017e-05', 'epoch': '0.08046', 'num_input_tokens_seen': 1348973, 'train_runtime': '689.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.133', 'grad_norm': '2.001', 'learning_rate': '4.023e-05', 'epoch': '0.08059', 'num_input_tokens_seen': 1351020, 'train_runtime': '690.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.6163', 'grad_norm': '1.413', 'learning_rate': '4.029e-05', 'epoch': '0.08071', 'num_input_tokens_seen': 1353067, 'train_runtime': '691.3', 'train_tokens_per_second': '1957'}
+{'loss': '1.27', 'grad_norm': '2.103', 'learning_rate': '4.035e-05', 'epoch': '0.08083', 'num_input_tokens_seen': 1355114, 'train_runtime': '692.3', 'train_tokens_per_second': '1957'}
+{'loss': '1.274', 'grad_norm': '1.93', 'learning_rate': '4.042e-05', 'epoch': '0.08095', 'num_input_tokens_seen': 1357161, 'train_runtime': '693.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.5255', 'grad_norm': '1.358', 'learning_rate': '4.048e-05', 'epoch': '0.08107', 'num_input_tokens_seen': 1359208, 'train_runtime': '694.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.8106', 'grad_norm': '1.585', 'learning_rate': '4.054e-05', 'epoch': '0.0812', 'num_input_tokens_seen': 1361255, 'train_runtime': '695.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.9706', 'grad_norm': '1.666', 'learning_rate': '4.06e-05', 'epoch': '0.08132', 'num_input_tokens_seen': 1363302, 'train_runtime': '696.5', 'train_tokens_per_second': '1957'}
+{'loss': '1.314', 'grad_norm': '2.167', 'learning_rate': '4.066e-05', 'epoch': '0.08144', 'num_input_tokens_seen': 1365349, 'train_runtime': '697.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.318', 'grad_norm': '1.754', 'learning_rate': '4.072e-05', 'epoch': '0.08156', 'num_input_tokens_seen': 1367396, 'train_runtime': '698.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.9575', 'grad_norm': '1.795', 'learning_rate': '4.078e-05', 'epoch': '0.08168', 'num_input_tokens_seen': 1369443, 'train_runtime': '699.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.245', 'grad_norm': '1.501', 'learning_rate': '4.084e-05', 'epoch': '0.08181', 'num_input_tokens_seen': 1371490, 'train_runtime': '700.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.5765', 'grad_norm': '1.375', 'learning_rate': '4.09e-05', 'epoch': '0.08193', 'num_input_tokens_seen': 1373537, 'train_runtime': '701.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.357', 'grad_norm': '1.856', 'learning_rate': '4.096e-05', 'epoch': '0.08205', 'num_input_tokens_seen': 1375584, 'train_runtime': '702.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.4067', 'grad_norm': '1.122', 'learning_rate': '4.103e-05', 'epoch': '0.08217', 'num_input_tokens_seen': 1377631, 'train_runtime': '703.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.494', 'grad_norm': '1.252', 'learning_rate': '4.109e-05', 'epoch': '0.0823', 'num_input_tokens_seen': 1379678, 'train_runtime': '704.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.9153', 'grad_norm': '1.707', 'learning_rate': '4.115e-05', 'epoch': '0.08242', 'num_input_tokens_seen': 1381725, 'train_runtime': '705.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.114', 'grad_norm': '2.161', 'learning_rate': '4.121e-05', 'epoch': '0.08254', 'num_input_tokens_seen': 1383772, 'train_runtime': '707', 'train_tokens_per_second': '1957'}
+{'loss': '0.9297', 'grad_norm': '1.828', 'learning_rate': '4.127e-05', 'epoch': '0.08266', 'num_input_tokens_seen': 1385819, 'train_runtime': '708', 'train_tokens_per_second': '1957'}
+{'loss': '0.5989', 'grad_norm': '1.527', 'learning_rate': '4.133e-05', 'epoch': '0.08278', 'num_input_tokens_seen': 1387866, 'train_runtime': '709.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.183', 'grad_norm': '2.54', 'learning_rate': '4.139e-05', 'epoch': '0.08291', 'num_input_tokens_seen': 1389913, 'train_runtime': '710.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.9744', 'grad_norm': '1.924', 'learning_rate': '4.145e-05', 'epoch': '0.08303', 'num_input_tokens_seen': 1391960, 'train_runtime': '711.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.7705', 'grad_norm': '1.457', 'learning_rate': '4.151e-05', 'epoch': '0.08315', 'num_input_tokens_seen': 1394007, 'train_runtime': '712.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.9755', 'grad_norm': '2.062', 'learning_rate': '4.158e-05', 'epoch': '0.08327', 'num_input_tokens_seen': 1396054, 'train_runtime': '713.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.5344', 'grad_norm': '1.387', 'learning_rate': '4.164e-05', 'epoch': '0.08339', 'num_input_tokens_seen': 1398101, 'train_runtime': '714.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.8974', 'grad_norm': '1.606', 'learning_rate': '4.17e-05', 'epoch': '0.08352', 'num_input_tokens_seen': 1400148, 'train_runtime': '715.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.6198', 'grad_norm': '1.334', 'learning_rate': '4.176e-05', 'epoch': '0.08364', 'num_input_tokens_seen': 1402195, 'train_runtime': '716.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.053', 'grad_norm': '2.106', 'learning_rate': '4.182e-05', 'epoch': '0.08376', 'num_input_tokens_seen': 1404242, 'train_runtime': '717.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.8598', 'grad_norm': '1.504', 'learning_rate': '4.188e-05', 'epoch': '0.08388', 'num_input_tokens_seen': 1406289, 'train_runtime': '718.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.205', 'grad_norm': '2.296', 'learning_rate': '4.194e-05', 'epoch': '0.084', 'num_input_tokens_seen': 1408336, 'train_runtime': '719.5', 'train_tokens_per_second': '1957'}
+{'loss': '1.072', 'grad_norm': '2.085', 'learning_rate': '4.2e-05', 'epoch': '0.08413', 'num_input_tokens_seen': 1410383, 'train_runtime': '720.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.8908', 'grad_norm': '1.679', 'learning_rate': '4.206e-05', 'epoch': '0.08425', 'num_input_tokens_seen': 1412430, 'train_runtime': '721.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.133', 'grad_norm': '1.932', 'learning_rate': '4.212e-05', 'epoch': '0.08437', 'num_input_tokens_seen': 1414477, 'train_runtime': '722.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.193', 'grad_norm': '2.009', 'learning_rate': '4.219e-05', 'epoch': '0.08449', 'num_input_tokens_seen': 1416524, 'train_runtime': '723.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.036', 'grad_norm': '2.02', 'learning_rate': '4.225e-05', 'epoch': '0.08462', 'num_input_tokens_seen': 1418571, 'train_runtime': '724.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.4755', 'grad_norm': '1.479', 'learning_rate': '4.231e-05', 'epoch': '0.08474', 'num_input_tokens_seen': 1420618, 'train_runtime': '725.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.038', 'grad_norm': '2.505', 'learning_rate': '4.237e-05', 'epoch': '0.08486', 'num_input_tokens_seen': 1422665, 'train_runtime': '726.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.9895', 'grad_norm': '2.002', 'learning_rate': '4.243e-05', 'epoch': '0.08498', 'num_input_tokens_seen': 1424712, 'train_runtime': '727.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.041', 'grad_norm': '1.674', 'learning_rate': '4.249e-05', 'epoch': '0.0851', 'num_input_tokens_seen': 1426759, 'train_runtime': '728.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.064', 'grad_norm': '1.955', 'learning_rate': '4.255e-05', 'epoch': '0.08523', 'num_input_tokens_seen': 1428806, 'train_runtime': '730', 'train_tokens_per_second': '1957'}
+{'loss': '1.219', 'grad_norm': '2.598', 'learning_rate': '4.261e-05', 'epoch': '0.08535', 'num_input_tokens_seen': 1430853, 'train_runtime': '731', 'train_tokens_per_second': '1957'}
+{'loss': '0.4918', 'grad_norm': '1.582', 'learning_rate': '4.267e-05', 'epoch': '0.08547', 'num_input_tokens_seen': 1432900, 'train_runtime': '732', 'train_tokens_per_second': '1957'}
+{'loss': '0.8803', 'grad_norm': '1.222', 'learning_rate': '4.274e-05', 'epoch': '0.08559', 'num_input_tokens_seen': 1434947, 'train_runtime': '733.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.459', 'grad_norm': '1.141', 'learning_rate': '4.28e-05', 'epoch': '0.08571', 'num_input_tokens_seen': 1436994, 'train_runtime': '734.1', 'train_tokens_per_second': '1957'}
+{'loss': '1.191', 'grad_norm': '2.038', 'learning_rate': '4.286e-05', 'epoch': '0.08584', 'num_input_tokens_seen': 1439041, 'train_runtime': '735.2', 'train_tokens_per_second': '1957'}
+{'loss': '1.207', 'grad_norm': '2.139', 'learning_rate': '4.292e-05', 'epoch': '0.08596', 'num_input_tokens_seen': 1441088, 'train_runtime': '736.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.5209', 'grad_norm': '1.371', 'learning_rate': '4.298e-05', 'epoch': '0.08608', 'num_input_tokens_seen': 1443135, 'train_runtime': '737.3', 'train_tokens_per_second': '1957'}
+{'loss': '1.089', 'grad_norm': '1.668', 'learning_rate': '4.304e-05', 'epoch': '0.0862', 'num_input_tokens_seen': 1445182, 'train_runtime': '738.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.9445', 'grad_norm': '1.693', 'learning_rate': '4.31e-05', 'epoch': '0.08632', 'num_input_tokens_seen': 1447229, 'train_runtime': '739.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.7313', 'grad_norm': '1.474', 'learning_rate': '4.316e-05', 'epoch': '0.08645', 'num_input_tokens_seen': 1449276, 'train_runtime': '740.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.5679', 'grad_norm': '1.173', 'learning_rate': '4.322e-05', 'epoch': '0.08657', 'num_input_tokens_seen': 1451323, 'train_runtime': '741.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.7669', 'grad_norm': '1.759', 'learning_rate': '4.328e-05', 'epoch': '0.08669', 'num_input_tokens_seen': 1453370, 'train_runtime': '742.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.7984', 'grad_norm': '1.63', 'learning_rate': '4.335e-05', 'epoch': '0.08681', 'num_input_tokens_seen': 1455417, 'train_runtime': '743.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.8873', 'grad_norm': '1.582', 'learning_rate': '4.341e-05', 'epoch': '0.08694', 'num_input_tokens_seen': 1457464, 'train_runtime': '744.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.9216', 'grad_norm': '1.755', 'learning_rate': '4.347e-05', 'epoch': '0.08706', 'num_input_tokens_seen': 1459511, 'train_runtime': '745.6', 'train_tokens_per_second': '1957'}
+{'loss': '1.021', 'grad_norm': '1.783', 'learning_rate': '4.353e-05', 'epoch': '0.08718', 'num_input_tokens_seen': 1461558, 'train_runtime': '746.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.4473', 'grad_norm': '1.137', 'learning_rate': '4.359e-05', 'epoch': '0.0873', 'num_input_tokens_seen': 1463605, 'train_runtime': '747.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.9687', 'grad_norm': '2.272', 'learning_rate': '4.365e-05', 'epoch': '0.08742', 'num_input_tokens_seen': 1465652, 'train_runtime': '748.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.112', 'grad_norm': '2.072', 'learning_rate': '4.371e-05', 'epoch': '0.08755', 'num_input_tokens_seen': 1467699, 'train_runtime': '749.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.318', 'grad_norm': '2.045', 'learning_rate': '4.377e-05', 'epoch': '0.08767', 'num_input_tokens_seen': 1469746, 'train_runtime': '750.8', 'train_tokens_per_second': '1957'}
+{'loss': '3.067', 'grad_norm': '122.4', 'learning_rate': '4.383e-05', 'epoch': '0.08779', 'num_input_tokens_seen': 1471793, 'train_runtime': '751.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.9648', 'grad_norm': '2.107', 'learning_rate': '4.389e-05', 'epoch': '0.08791', 'num_input_tokens_seen': 1473840, 'train_runtime': '752.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.104', 'grad_norm': '2.104', 'learning_rate': '4.396e-05', 'epoch': '0.08803', 'num_input_tokens_seen': 1475887, 'train_runtime': '754', 'train_tokens_per_second': '1957'}
+{'loss': '0.8007', 'grad_norm': '1.651', 'learning_rate': '4.402e-05', 'epoch': '0.08816', 'num_input_tokens_seen': 1477934, 'train_runtime': '755', 'train_tokens_per_second': '1957'}
+{'loss': '1.484', 'grad_norm': '2.332', 'learning_rate': '4.408e-05', 'epoch': '0.08828', 'num_input_tokens_seen': 1479981, 'train_runtime': '756.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.9249', 'grad_norm': '2.012', 'learning_rate': '4.414e-05', 'epoch': '0.0884', 'num_input_tokens_seen': 1482028, 'train_runtime': '757.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.5576', 'grad_norm': '1.598', 'learning_rate': '4.42e-05', 'epoch': '0.08852', 'num_input_tokens_seen': 1484075, 'train_runtime': '758.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.9556', 'grad_norm': '2.065', 'learning_rate': '4.426e-05', 'epoch': '0.08864', 'num_input_tokens_seen': 1486122, 'train_runtime': '759.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.7808', 'grad_norm': '1.879', 'learning_rate': '4.432e-05', 'epoch': '0.08877', 'num_input_tokens_seen': 1488169, 'train_runtime': '760.3', 'train_tokens_per_second': '1957'}
+{'loss': '0.8358', 'grad_norm': '1.63', 'learning_rate': '4.438e-05', 'epoch': '0.08889', 'num_input_tokens_seen': 1490216, 'train_runtime': '761.3', 'train_tokens_per_second': '1957'}
+{'loss': '1.142', 'grad_norm': '1.875', 'learning_rate': '4.444e-05', 'epoch': '0.08901', 'num_input_tokens_seen': 1492263, 'train_runtime': '762.4', 'train_tokens_per_second': '1957'}
+{'loss': '1.125', 'grad_norm': '1.99', 'learning_rate': '4.451e-05', 'epoch': '0.08913', 'num_input_tokens_seen': 1494310, 'train_runtime': '763.4', 'train_tokens_per_second': '1957'}
+{'loss': '0.4695', 'grad_norm': '1.251', 'learning_rate': '4.457e-05', 'epoch': '0.08926', 'num_input_tokens_seen': 1496357, 'train_runtime': '764.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.5602', 'grad_norm': '1.583', 'learning_rate': '4.463e-05', 'epoch': '0.08938', 'num_input_tokens_seen': 1498404, 'train_runtime': '765.5', 'train_tokens_per_second': '1957'}
+{'loss': '0.5169', 'grad_norm': '1.655', 'learning_rate': '4.469e-05', 'epoch': '0.0895', 'num_input_tokens_seen': 1500451, 'train_runtime': '766.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.4973', 'grad_norm': '1.442', 'learning_rate': '4.475e-05', 'epoch': '0.08962', 'num_input_tokens_seen': 1502498, 'train_runtime': '767.6', 'train_tokens_per_second': '1957'}
+{'loss': '0.9339', 'grad_norm': '1.584', 'learning_rate': '4.481e-05', 'epoch': '0.08974', 'num_input_tokens_seen': 1504545, 'train_runtime': '768.7', 'train_tokens_per_second': '1957'}
+{'loss': '0.5398', 'grad_norm': '1.733', 'learning_rate': '4.487e-05', 'epoch': '0.08987', 'num_input_tokens_seen': 1506592, 'train_runtime': '769.7', 'train_tokens_per_second': '1957'}
+{'loss': '1.076', 'grad_norm': '1.989', 'learning_rate': '4.493e-05', 'epoch': '0.08999', 'num_input_tokens_seen': 1508639, 'train_runtime': '770.8', 'train_tokens_per_second': '1957'}
+{'loss': '0.9521', 'grad_norm': '1.811', 'learning_rate': '4.499e-05', 'epoch': '0.09011', 'num_input_tokens_seen': 1510686, 'train_runtime': '771.8', 'train_tokens_per_second': '1957'}
+{'loss': '1.194', 'grad_norm': '2.602', 'learning_rate': '4.505e-05', 'epoch': '0.09023', 'num_input_tokens_seen': 1512733, 'train_runtime': '772.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.066', 'grad_norm': '2.438', 'learning_rate': '4.512e-05', 'epoch': '0.09035', 'num_input_tokens_seen': 1514780, 'train_runtime': '773.9', 'train_tokens_per_second': '1957'}
+{'loss': '0.5238', 'grad_norm': '1.463', 'learning_rate': '4.518e-05', 'epoch': '0.09048', 'num_input_tokens_seen': 1516827, 'train_runtime': '774.9', 'train_tokens_per_second': '1957'}
+{'loss': '1.135', 'grad_norm': '2.25', 'learning_rate': '4.524e-05', 'epoch': '0.0906', 'num_input_tokens_seen': 1518874, 'train_runtime': '776', 'train_tokens_per_second': '1957'}
+{'loss': '0.8149', 'grad_norm': '2.072', 'learning_rate': '4.53e-05', 'epoch': '0.09072', 'num_input_tokens_seen': 1520921, 'train_runtime': '777', 'train_tokens_per_second': '1957'}
+{'loss': '0.8391', 'grad_norm': '1.97', 'learning_rate': '4.536e-05', 'epoch': '0.09084', 'num_input_tokens_seen': 1522968, 'train_runtime': '778.1', 'train_tokens_per_second': '1957'}
+{'loss': '0.8223', 'grad_norm': '1.574', 'learning_rate': '4.542e-05', 'epoch': '0.09096', 'num_input_tokens_seen': 1525015, 'train_runtime': '779.2', 'train_tokens_per_second': '1957'}
+{'loss': '0.551', 'grad_norm': '1.385', 'learning_rate': '4.548e-05', 'epoch': '0.09109', 'num_input_tokens_seen': 1527062, 'train_runtime': '780.3', 'train_tokens_per_second': '1957'}
+  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
+    sys.exit(main())
+             ^^^^^^
+  File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
+    launcher.launch()
+  File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
+    run_exp()
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
+    _training_function(config={"args": args, "callbacks": callbacks})
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
+    run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
+  File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
+    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
+    return inner_training_loop(
+           ^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
+    self.accelerator.backward(loss, **kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
+    loss.backward(**kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
+    torch.autograd.backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
+    _engine_run_backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
+    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+KeyboardInterrupt
diff --git a/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/requirements.txt b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..71ae727a8e886521d944e8dd5e889d6f4dcaf880
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/requirements.txt
@@ -0,0 +1,257 @@
+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+llamafactory==0.9.5.dev0
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
diff --git a/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..be9607b210717b790a552e83bc7103dbbfa9a907
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/wandb-metadata.json
@@ -0,0 +1,41 @@
+{
+  "os":  "Linux-6.8.0-90-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T03:49:47.693011Z",
+  "args":  [
+    "/workspace/v127rc_exp1/B.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "34f54978776c",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  24,
+  "cpu_count_logical":  48,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "1931444224"
+    }
+  },
+  "memory":  {
+    "total":  "405012275200"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-acb5171c-45e7-5653-1120-9d0cd2a192a6"
+    }
+  ],
+  "cudaVersion":  "12.8",
+  "writerId":  "w3ltcjzxbduoqrmb4bdrwr1550cgr9rj"
+}
\ No newline at end of file
diff --git a/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/wandb-summary.json b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..9661f67b71c748b2b4cb1bd68485f0cc392c8c88
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/files/wandb-summary.json
@@ -0,0 +1 @@
+{"train/learning_rate":4.548229548229548e-05,"train/num_input_tokens_seen":1527062,"train/train_tokens_per_second":1957.127,"train/epoch":0.09108669108669108,"_wandb":{"runtime":779},"train/loss":0.5510396361351013,"train_runtime":780.2571,"_step":745,"_runtime":779,"train/global_step":746,"_timestamp":1.7701777675383866e+09,"train/grad_norm":1.3848786354064941}
\ No newline at end of file
diff --git a/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug-internal.log b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..f34cd3e4e477cd48813c887542b6384ed456d3a6
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug-internal.log
@@ -0,0 +1,11 @@
+{"time":"2026-02-04T03:49:47.944132466Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T03:49:48.310719342Z","level":"INFO","msg":"stream: created new stream","id":"l0qo8o9h"}
+{"time":"2026-02-04T03:49:48.31150827Z","level":"INFO","msg":"handler: started","stream_id":"l0qo8o9h"}
+{"time":"2026-02-04T03:49:48.313047258Z","level":"INFO","msg":"stream: started","id":"l0qo8o9h"}
+{"time":"2026-02-04T03:49:48.313056693Z","level":"INFO","msg":"writer: started","stream_id":"l0qo8o9h"}
+{"time":"2026-02-04T03:49:48.313065576Z","level":"INFO","msg":"sender: started","stream_id":"l0qo8o9h"}
+{"time":"2026-02-04T04:02:48.517900902Z","level":"INFO","msg":"stream: closing","id":"l0qo8o9h"}
+{"time":"2026-02-04T04:02:49.597886794Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-04T04:02:49.807668241Z","level":"INFO","msg":"handler: closed","stream_id":"l0qo8o9h"}
+{"time":"2026-02-04T04:02:49.812070815Z","level":"INFO","msg":"sender: closed","stream_id":"l0qo8o9h"}
+{"time":"2026-02-04T04:02:49.812478333Z","level":"INFO","msg":"stream: closed","id":"l0qo8o9h"}
diff --git a/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug.log b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..d1b88fe276e37f8adb503d2e342ed30f034ade6b
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug.log
@@ -0,0 +1,259 @@
+2026-02-04 03:49:47,718 INFO    MainThread:4648 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 03:49:47,718 INFO    MainThread:4648 [wandb_setup.py:_flush():81] Configure stats pid to 4648
+2026-02-04 03:49:47,718 INFO    MainThread:4648 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 03:49:47,719 INFO    MainThread:4648 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug.log
+2026-02-04 03:49:47,720 INFO    MainThread:4648 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_034947-l0qo8o9h/logs/debug-internal.log
+2026-02-04 03:49:47,720 INFO    MainThread:4648 [wandb_init.py:init():844] calling init triggers
+2026-02-04 03:49:47,720 INFO    MainThread:4648 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 03:49:47,721 INFO    MainThread:4648 [wandb_init.py:init():892] starting backend
+2026-02-04 03:49:47,935 INFO    MainThread:4648 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 03:49:47,941 INFO    MainThread:4648 [wandb_init.py:init():903] backend started and connected
+2026-02-04 03:49:47,943 INFO    MainThread:4648 [wandb_init.py:init():973] updated telemetry
+2026-02-04 03:49:47,985 INFO    MainThread:4648 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 03:49:48,735 INFO    MainThread:4648 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 03:49:48,819 INFO    MainThread:4648 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 03:49:48,820 INFO    MainThread:4648 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 03:49:48,820 INFO    MainThread:4648 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 03:49:48,821 INFO    MainThread:4648 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 03:49:48,823 INFO    MainThread:4648 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 03:49:48,824 INFO    MainThread:4648 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['down_proj', 'k_proj', 'v_proj', 'q_proj', 'gate_proj', 'up_proj', 'o_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 
'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 
'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 03:49:48,830 INFO    MainThread:4648 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7fd00848c510>>
+2026-02-04 03:49:48,831 INFO    MainThread:4648 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 03:49:48,833 INFO    MainThread:4648 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 
'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t35_d0_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': 
None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-04 04:02:48,517 INFO    wandb-AsyncioManager-main:4648 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-04 04:02:48,518 INFO    wandb-AsyncioManager-main:4648 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
+2026-02-04 04:02:49,842 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,845 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,847 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,849 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,854 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,855 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,856 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,858 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,858 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,859 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,859 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,860 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,861 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,863 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,863 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,863 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,864 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
+2026-02-04 04:02:49,865 ERROR   wandb-AsyncioManager-main:4648 [asyncio_manager.py:fn_wrap_exceptions():183] Uncaught exception in run_soon callback.
+Traceback (most recent call last):
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/asyncio_manager.py", line 181, in fn_wrap_exceptions
+    await fn()
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 38, in publish
+    await self._send_server_request(request)
+  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/service/service_client.py", line 64, in _send_server_request
+    await self._writer.drain()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 392, in drain
+    await self._protocol._drain_helper()
+  File "/usr/lib/python3.11/asyncio/streams.py", line 166, in _drain_helper
+    raise ConnectionResetError('Connection lost')
+ConnectionResetError: Connection lost
diff --git a/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/requirements.txt b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..250b7ca7baef222ee78261c629b347d5f4fe7859
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/requirements.txt
@@ -0,0 +1,257 @@
+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
diff --git a/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug-internal.log b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..3a766f09390ec63481049c55ec163b84f23f8559
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug-internal.log
@@ -0,0 +1,11 @@
+{"time":"2026-02-04T03:57:46.416372196Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T03:57:46.753451309Z","level":"INFO","msg":"stream: created new stream","id":"cloyjeo5"}
+{"time":"2026-02-04T03:57:46.75405663Z","level":"INFO","msg":"handler: started","stream_id":"cloyjeo5"}
+{"time":"2026-02-04T03:57:46.755686921Z","level":"INFO","msg":"stream: started","id":"cloyjeo5"}
+{"time":"2026-02-04T03:57:46.755706657Z","level":"INFO","msg":"writer: started","stream_id":"cloyjeo5"}
+{"time":"2026-02-04T03:57:46.755717207Z","level":"INFO","msg":"sender: started","stream_id":"cloyjeo5"}
+{"time":"2026-02-04T04:04:23.302284935Z","level":"INFO","msg":"stream: closing","id":"cloyjeo5"}
+{"time":"2026-02-04T04:04:24.501401508Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-04T04:04:24.748850215Z","level":"INFO","msg":"handler: closed","stream_id":"cloyjeo5"}
+{"time":"2026-02-04T04:04:24.752804101Z","level":"INFO","msg":"sender: closed","stream_id":"cloyjeo5"}
+{"time":"2026-02-04T04:04:24.753287447Z","level":"INFO","msg":"stream: closed","id":"cloyjeo5"}
diff --git a/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug.log b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..468a2cdf71460959dd6ca0ea372f973994fe73b6
--- /dev/null
+++ b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug.log
@@ -0,0 +1,25 @@
+2026-02-04 03:57:46,186 INFO    MainThread:1791 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 03:57:46,187 INFO    MainThread:1791 [wandb_setup.py:_flush():81] Configure stats pid to 1791
+2026-02-04 03:57:46,187 INFO    MainThread:1791 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 03:57:46,188 INFO    MainThread:1791 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug.log
+2026-02-04 03:57:46,188 INFO    MainThread:1791 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/logs/debug-internal.log
+2026-02-04 03:57:46,189 INFO    MainThread:1791 [wandb_init.py:init():844] calling init triggers
+2026-02-04 03:57:46,189 INFO    MainThread:1791 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 03:57:46,190 INFO    MainThread:1791 [wandb_init.py:init():892] starting backend
+2026-02-04 03:57:46,408 INFO    MainThread:1791 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 03:57:46,415 INFO    MainThread:1791 [wandb_init.py:init():903] backend started and connected
+2026-02-04 03:57:46,416 INFO    MainThread:1791 [wandb_init.py:init():973] updated telemetry
+2026-02-04 03:57:46,466 INFO    MainThread:1791 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 03:57:47,132 INFO    MainThread:1791 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 03:57:47,201 INFO    MainThread:1791 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 03:57:47,206 INFO    MainThread:1791 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 03:57:47,208 INFO    MainThread:1791 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 03:57:47,208 INFO    MainThread:1791 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 03:57:47,210 INFO    MainThread:1791 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 03:57:47,211 INFO    MainThread:1791 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['up_proj', 'q_proj', 'gate_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 
'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 
'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 03:57:47,217 INFO    MainThread:1791 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x744df6f36e10>>
+2026-02-04 03:57:47,218 INFO    MainThread:1791 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 03:57:47,220 INFO    MainThread:1791 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 
'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': 
None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-04 04:04:23,301 INFO    wandb-AsyncioManager-main:1791 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-04 04:04:23,308 INFO    wandb-AsyncioManager-main:1791 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.